From 1f5568f3b7102677aa97783331e05d828aa4c766 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Wed, 5 Nov 2025 14:21:53 +0100
Subject: [PATCH 01/68] do not auto add default qos class (#720)

* added 3 secs sleep before add qos class

* wip

* Revert "wip"

This reverts commit b0f2ba695f778f5eb177fab8a474210e12b0c69a.

* increase sleep time

* remove auto add default qos class

---------

Co-authored-by: hamdykhader <hamdy.khader@gmail.com>
---
 simplyblock_core/cluster_ops.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py
index 103123934..fbe91a58f 100644
--- a/simplyblock_core/cluster_ops.py
+++ b/simplyblock_core/cluster_ops.py
@@ -371,8 +371,6 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass,
 
     cluster.write_to_db(db_controller.kv_store)
 
-    qos_controller.add_class("Default", 100, cluster.get_id())
-
     cluster_events.cluster_create(cluster)
 
     mgmt_node_ops.add_mgmt_node(dev_ip, mode, cluster.uuid)

From 3af633b009d706c07e7526e36c5ab744c28e5041 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Wed, 5 Nov 2025 14:30:53 +0100
Subject: [PATCH 02/68] Update env_var (#721)

---
 simplyblock_core/env_var | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var
index f3e377ee4..e1d2e2f8b 100644
--- a/simplyblock_core/env_var
+++ b/simplyblock_core/env_var
@@ -1,5 +1,5 @@
 SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev
-SIMPLY_BLOCK_VERSION=19.2.23
+SIMPLY_BLOCK_VERSION=19.2.24
 
 SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main
 SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest

From ad546ca5fe667a74a5559109fb0e7c58d3a707b0 Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Tue, 11 Nov 2025 10:04:15 +0300
Subject: [PATCH 03/68] Enable ndcs and npcs when creating lvol (#729)

---
 simplyblock_core/rpc_client.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py
index 62f37b1e9..66ef478f8 100644
--- a/simplyblock_core/rpc_client.py
+++ b/simplyblock_core/rpc_client.py
@@ -379,11 +379,11 @@ def create_lvol(self, name, size_in_mib, lvs_name, lvol_priority_class=0, ndcs=0
             "clear_method": "unmap",
             "lvol_priority_class": lvol_priority_class,
         }
-        # if ndcs or npcs:
-        #     params.update({
-        #         'ndcs' : ndcs,
-        #         'npcs' : npcs,
-        #     })
+        if ndcs or npcs:
+            params.update({
+                'ndcs' : ndcs,
+                'npcs' : npcs,
+            })
         return self._request("bdev_lvol_create", params)
 
     def delete_lvol(self, name, del_async=False):

From 5f6382b008c437d67b8e92d9a7324a1391b3dd7a Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Tue, 11 Nov 2025 11:05:47 +0300
Subject: [PATCH 04/68] Fix sfam-2450 cluster update issues (#726)

- set cluster mode to default "docker"
- remove service "app_CachingNodeMonitor" from services during cluster update
---
 simplyblock_core/cluster_ops.py    | 10 +++++++---
 simplyblock_core/models/cluster.py |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py
index fbe91a58f..ff07e6634 100644
--- a/simplyblock_core/cluster_ops.py
+++ b/simplyblock_core/cluster_ops.py
@@ -1174,9 +1174,13 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None,
         for service in cluster_docker.services.list():
             if image_parts in service.attrs['Spec']['Labels']['com.docker.stack.image'] or \
             "simplyblock" in service.attrs['Spec']['Labels']['com.docker.stack.image']:
-                logger.info(f"Updating service {service.name}")
-                service.update(image=service_image, force_update=True)
-                service_names.append(service.attrs['Spec']['Name'])
+                if service.name == "app_CachingNodeMonitor":
+                    logger.info(f"Removing service {service.name}")
+                    service.remove()
+                else:
+                    logger.info(f"Updating service {service.name}")
+                    service.update(image=service_image, force_update=True)
+                    service_names.append(service.attrs['Spec']['Name'])
 
         if "app_SnapshotMonitor" not in service_names:
             logger.info("Creating snapshot monitor service")
diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py
index fd4802771..620309f77 100644
--- a/simplyblock_core/models/cluster.py
+++ b/simplyblock_core/models/cluster.py
@@ -45,7 +45,7 @@ class Cluster(BaseModel):
     distr_npcs: int = 0
     enable_node_affinity: bool = False
     grafana_endpoint: str = ""
-    mode: str = ""
+    mode: str = "docker"
     grafana_secret: str = ""
     contact_point: str = ""
     ha_type: str = "single"

From 4a6a4d70dc771fc502fdd94b501c93b1bfaba75f Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Tue, 11 Nov 2025 09:24:24 +0100
Subject: [PATCH 05/68] Update Dockerfile_base (#730)

* Update Dockerfile_base

* Update Dockerfile_base
---
 docker/Dockerfile_base | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docker/Dockerfile_base b/docker/Dockerfile_base
index 226188c96..201d92759 100644
--- a/docker/Dockerfile_base
+++ b/docker/Dockerfile_base
@@ -38,3 +38,5 @@ RUN pip3 install setuptools --upgrade
 COPY requirements.txt requirements.txt
 
 RUN pip3 install -r requirements.txt
+
+RUN rm -rf /usr/share/terminfo/

From 3b54c88e0fab8da8dce720f511ae03e5461b0e3f Mon Sep 17 00:00:00 2001
From: schmidt-scaled <schmidt@scaled.cloud>
Date: Tue, 11 Nov 2025 22:08:54 +0300
Subject: [PATCH 06/68] set size of lvstore cluster in constants (as ratio to
 distrib page size)

---
 .../controllers/lvol_migration_controller.py  | 251 ++++++++++++++++++
 simplyblock_core/models/lvol_migration.py     |  48 ++++
 2 files changed, 299 insertions(+)
 create mode 100644 simplyblock_core/controllers/lvol_migration_controller.py
 create mode 100644 simplyblock_core/models/lvol_migration.py

diff --git a/simplyblock_core/controllers/lvol_migration_controller.py b/simplyblock_core/controllers/lvol_migration_controller.py
new file mode 100644
index 000000000..71cb30426
--- /dev/null
+++ b/simplyblock_core/controllers/lvol_migration_controller.py
@@ -0,0 +1,251 @@
+# Ticket description for live lvol migration:
+# Live lvol migration moves lvols together with all related objects
+# (related snapshots, related clones) from one storage node to another
+# storage node in the same cluster. This happens online and very fast,
+# as no actual data is copied.
+#
+# It is NOT possible:
+# - to move snapshots or clones independently from the lvol
+# - to move namespace lvols belonging to the same subsystem independently
+#
+# We need to implement this feature in control plane in two steps:
+# a) move a specific lvol and its related objects based on the lvol name
+#    or uuid from one node to another node. The other node must be online
+#    and it must not be the secondary of the node the lvol is currently attached to.
+# b) create an automatism, which periodically controls the balance of
+#    performance and ram consumption across nodes and re-balances certain
+#    lvols if a node becomes over-loaded
+
+import asyncio
+from typing import Iterable
+
+from ..models.lvol_migration import (
+    LvolMigration,
+    MigrationItem,
+    MigrationState,
+    StorageObject
+)
+from enum import Enum
+
+
+class ObjType(Enum):
+    SNAPSHOT = "snapshot"
+    CLONE = "clone"
+    LVOL = "lvol"
+
+
+class MigrationQueue(LvolMigration):
+    def add_object(self, storage_obj: StorageObject, obj_type: ObjType):
+        item = MigrationItem(storage=storage_obj, state=MigrationState.NEW)
+        item.type = obj_type
+        self.migrations.append(item)
+        return item
+
+    def iter_snapshots(self):
+        return (m for m in self.migrations if m.type == ObjType.SNAPSHOT)
+
+    def iter_clones(self):
+        return (m for m in self.migrations if m.type == ObjType.CLONE)
+
+    def iter_lvol(self):
+        return (m for m in self.migrations if m.type == ObjType.LVOL)
+
+
+# -------------------------------------------------------------------------
+# Async-capable Controller
+# -------------------------------------------------------------------------
+
+class LvolMigrationController:
+
+    # ---------------------------------------------------------------------
+    # Public entry point
+    # ---------------------------------------------------------------------
+
+    async def migrate_lvol(self, lvol) -> str:
+        mq = self.create_migration_queue(lvol)
+
+        if self.all_nodes_online():
+            self.freeze_snapshots_clones(lvol)
+
+            result = await self.process_migration_queue(mq)
+
+            if result != "DONE":
+                self.register_continue(mq)
+                return "SUSPENDED"
+
+            self.unfreeze_snapshots_clones(lvol)
+            return "DONE"
+
+        return "SUSPENDED"
+
+    # ---------------------------------------------------------------------
+
+    def create_migration_queue(self, lvol) -> MigrationQueue:
+        mq = MigrationQueue(
+            primary_source=lvol.primary,
+            secondary_source=lvol.secondary,
+            primary_target=lvol.target_primary,
+            secondary_target=lvol.target_secondary,
+        )
+
+        for s in lvol.get_snapshots():
+            mq.add_object(s, ObjType.SNAPSHOT)
+
+        for c in lvol.get_clones():
+            mq.add_object(c, ObjType.CLONE)
+
+        mq.add_object(lvol, ObjType.LVOL)
+        return mq
+
+    # ---------------------------------------------------------------------
+    # Core logic with asyncio
+    # ---------------------------------------------------------------------
+
+    async def process_migration_queue(self, mq: MigrationQueue) -> str:
+
+        if not await self._process_subset(mq, mq.iter_snapshots()):
+            return "CANCELED"
+
+        if not await self._process_subset(mq, mq.iter_clones()):
+            return "CANCELED"
+
+        if not await self._process_subset(mq, mq.iter_lvol()):
+            return "CANCELED"
+
+        return "DONE"
+
+    # ---------------------------------------------------------------------
+
+    async def _process_subset(self, mq: MigrationQueue, iterator: Iterable[MigrationItem]) -> bool:
+        tasks = []
+
+        for item in iterator:
+            if item.state in (MigrationState.NEW, MigrationState.IN_MIGRATION):
+                item.state = MigrationState.IN_MIGRATION
+                tasks.append(asyncio.create_task(self.migrate_object(item)))
+
+        if not tasks:
+            return True
+
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        if not self.all_nodes_online():
+            return False
+
+        # Check for errors
+        for r in results:
+            if isinstance(r, Exception):
+                return False
+            if getattr(r, "failed", False):
+                return False
+
+        # Mark items done
+        for item in iterator:
+            if item.state == MigrationState.IN_MIGRATION:
+                item.state = MigrationState.MIGRATED
+
+        return True
+
+    # ---------------------------------------------------------------------
+    # Cleanup
+    # ---------------------------------------------------------------------
+
+    async def cleanup_migration_queue(self, mq: MigrationQueue, lvol):
+        mq.status = "IN_DELETION"
+
+        tasks = []
+        for item in mq.migrations:
+            if item.state != MigrationState.NEW:
+                tasks.append(asyncio.create_task(
+                    self.async_delete(item.storage, mq.primary_target)
+                ))
+                tasks.append(asyncio.create_task(
+                    self.register_syn_delete(item.storage, mq.secondary_target)
+                ))
+
+        if tasks:
+            await asyncio.gather(*tasks, return_exceptions=True)
+
+        mq.status = "DELETED"
+        self.unfreeze_snapshots_clones(lvol)
+
+    # ---------------------------------------------------------------------
+    # Individual migration operations (async)
+    # ---------------------------------------------------------------------
+
+    async def migrate_object(self, item: MigrationItem):
+        src = item.storage.source_node
+        dst = item.storage.target_node
+
+        await self.create_target_namespace(dst, item.storage)
+        await self.connect_source_to_target(src, dst, item.storage)
+
+        if item.type == ObjType.SNAPSHOT:
+            return await self._migrate_snapshot(item)
+        elif item.type == ObjType.CLONE:
+            return await self._migrate_clone(item)
+        else:
+            return await self._migrate_lvol(item)
+
+    # ---------------------------------------------------------------------
+    # Snapshot migration with retries
+    # ---------------------------------------------------------------------
+
+    async def _migrate_snapshot(self, item):
+        for attempt in range(5):
+            result = await self.run_migration_rpc(item)
+            if result.success:
+                return result
+
+            if not self.all_nodes_online():
+                break
+
+            await asyncio.sleep(self.retry_delay(attempt))
+
+        return result
+
+    async def _migrate_clone(self, item):
+        return await self.run_migration_rpc(item)
+
+    async def _migrate_lvol(self, item):
+        return await self.run_migration_rpc(item)
+
+    # ---------------------------------------------------------------------
+    # Placeholder hooks (inject actual system implementation)
+    # ---------------------------------------------------------------------
+
+    async def create_target_namespace(self, dst, storage):
+        pass
+
+    async def connect_source_to_target(self, src, dst, storage):
+        pass
+
+    async def run_migration_rpc(self, item):
+        """
+        Must return an object with fields:
+            .success  -> bool
+            .failed   -> bool
+        or raise exception.
+        """
+        pass
+
+    async def async_delete(self, storage, target):
+        pass
+
+    async def register_syn_delete(self, storage, target):
+        pass
+
+    def freeze_snapshots_clones(self, lvol):
+        pass
+
+    def unfreeze_snapshots_clones(self, lvol):
+        pass
+
+    def all_nodes_online(self) -> bool:
+        pass
+
+    def register_continue(self, mq):
+        pass
+
+    def retry_delay(self, attempt: int) -> float:
+        return min(2 ** attempt, 60)   # exponential backoff
diff --git a/simplyblock_core/models/lvol_migration.py b/simplyblock_core/models/lvol_migration.py
new file mode 100644
index 000000000..9cfa04d04
--- /dev/null
+++ b/simplyblock_core/models/lvol_migration.py
@@ -0,0 +1,48 @@
+from enum import Enum
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+from simplyblock_core.models.lvol_model import LVol
+from simplyblock_core.models.snapshot import SnapShot
+from simplyblock_core.models.storage_node import StorageNode
+
+
+class MigrationState(Enum):
+    NEW = "new"
+    IN_MIGRATION = "in-migration"
+    MIGRATED = "migrated"
+
+
+@dataclass
+class StorageObject:
+    """Represents an lvol/clone/snapshot or similar."""
+    id: str
+    lvol_ref: LVol
+    snap_ref: SnapShot
+    type: str  # e.g., "lvol", "clone", "snapshot"
+
+
+@dataclass
+class MigrationItem:
+    """A single storage-object migration entry."""
+    storage: StorageObject
+    state: MigrationState = MigrationState.NEW
+
+@dataclass
+class LvolMigration:
+    """Model representing a full logical-volume migration plan."""
+    primary_source: StorageNode
+    secondary_source: StorageNode
+    primary_target: StorageNode
+    secondary_target: StorageNode
+    migrations: List[MigrationItem] = field(default_factory=list)
+
+    def add_migration(self, storage: StorageObject) -> None:
+        self.migrations.append(MigrationItem(storage))
+
+    def update_state(self, storage_id: str, new_state: MigrationState) -> None:
+        for item in self.migrations:
+            if item.storage.id == storage_id:
+                item.state = new_state
+                return
+        raise ValueError(f"No migration item with storage id={storage_id}")

From bf56cb67efb05db6aada28bf7d05b42268a8acc8 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Wed, 12 Nov 2025 19:22:03 +0100
Subject: [PATCH 07/68] inherit default cluster mode in new cluster (#733)

* inherit default cluster mode in new cluster

* added first six cluster id char to spdk pod name

* added first six cluster id char to spdk pod name

* update env_var
---
 simplyblock_core/cluster_ops.py               |  1 +
 simplyblock_core/env_var                      |  2 +-
 simplyblock_core/snode_client.py              |  8 ++++---
 simplyblock_core/storage_node_ops.py          | 10 ++++-----
 simplyblock_core/utils/__init__.py            |  8 ++++++-
 .../api/internal/storage_node/docker.py       |  1 +
 .../api/internal/storage_node/kubernetes.py   | 21 ++++++++++++-------
 .../templates/storage_deploy_spdk.yaml.j2     |  2 +-
 simplyblock_web/utils.py                      |  1 +
 9 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py
index ff07e6634..536546eab 100644
--- a/simplyblock_core/cluster_ops.py
+++ b/simplyblock_core/cluster_ops.py
@@ -457,6 +457,7 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn
     cluster.strict_node_anti_affinity = strict_node_anti_affinity
 
     default_cluster = clusters[0]
+    cluster.mode = default_cluster.mode
     cluster.db_connection = default_cluster.db_connection
     cluster.grafana_secret = monitoring_secret if default_cluster.mode == "kubernetes" else default_cluster.grafana_secret
     cluster.grafana_endpoint = default_cluster.grafana_endpoint
diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var
index e1d2e2f8b..cf8093b7c 100644
--- a/simplyblock_core/env_var
+++ b/simplyblock_core/env_var
@@ -1,6 +1,6 @@
 SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev
 SIMPLY_BLOCK_VERSION=19.2.24
 
-SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main
+SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:inherit_default_cluster_mode
 SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest
 
diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py
index 2e8504b08..5e5f66f60 100644
--- a/simplyblock_core/snode_client.py
+++ b/simplyblock_core/snode_client.py
@@ -81,7 +81,7 @@ def info(self):
     def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None, cluster_ip=None,
                            fdb_connection=None, namespace=None, server_ip=None, rpc_port=None,
                            rpc_username=None, rpc_password=None, multi_threading_enabled=False, timeout=0, ssd_pcie=None,
-                           total_mem=None, system_mem=None, cluster_mode=None):
+                           total_mem=None, system_mem=None, cluster_mode=None, cluster_id=None):
         params = {
             "cluster_ip": cluster_ip,
             "server_ip": server_ip,
@@ -113,6 +113,8 @@ def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None
             params["system_mem"] = system_mem
         if cluster_mode:
             params["cluster_mode"] = cluster_mode
+        if cluster_id:
+            params["cluster_id"] = cluster_id
         return self._request("POST", "spdk_process_start", params)
 
     def join_swarm(self, cluster_ip, join_token, db_connection, cluster_id):
@@ -124,8 +126,8 @@ def join_swarm(self, cluster_ip, join_token, db_connection, cluster_id):
         #     "db_connection": db_connection}
         # return self._request("POST", "join_swarm", params)
 
-    def spdk_process_kill(self, rpc_port):
-        return self._request("GET", "spdk_process_kill", {"rpc_port": rpc_port})
+    def spdk_process_kill(self, rpc_port, cluster_id=None):
+        return self._request("GET", "spdk_process_kill", {"rpc_port": rpc_port, "cluster_id": cluster_id})
 
     def leave_swarm(self):
         return True
diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py
index 3d32dd17a..162f0dd1a 100644
--- a/simplyblock_core/storage_node_ops.py
+++ b/simplyblock_core/storage_node_ops.py
@@ -998,7 +998,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list,
                 namespace, mgmt_ip, rpc_port, rpc_user, rpc_pass,
                 multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED,
                 timeout=constants.SPDK_PROXY_TIMEOUT,
-                ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode)
+                ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, cluster_id=cluster_id)
             time.sleep(5)
 
         except Exception as e:
@@ -1454,7 +1454,7 @@ def remove_storage_node(node_id, force_remove=False, force_migrate=False):
         if health_controller._check_node_api(snode.mgmt_ip):
             logger.info("Stopping SPDK container")
             snode_api = SNodeClient(snode.api_endpoint, timeout=20)
-            snode_api.spdk_process_kill(snode.rpc_port)
+            snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id)
             snode_api.leave_swarm()
             pci_address = []
             for dev in snode.nvme_devices:
@@ -1676,7 +1676,7 @@ def restart_storage_node(
             snode.l_cores, snode.spdk_mem, snode.spdk_image, spdk_debug, cluster_ip, fdb_connection,
             snode.namespace, snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password,
             multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED, timeout=constants.SPDK_PROXY_TIMEOUT,
-            ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode)
+            ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, cluster_id=snode.cluster_id)
 
     except Exception as e:
         logger.error(e)
@@ -2250,7 +2250,7 @@ def shutdown_storage_node(node_id, force=False):
 
     logger.info("Stopping SPDK")
     try:
-        SNodeClient(snode.api_endpoint, timeout=10, retry=10).spdk_process_kill(snode.rpc_port)
+        SNodeClient(snode.api_endpoint, timeout=10, retry=10).spdk_process_kill(snode.rpc_port, snode.cluster_id)
     except SNodeClientException:
         logger.error('Failed to kill SPDK')
         return False
@@ -3214,7 +3214,7 @@ def recreate_lvstore(snode, force=False):
     def _kill_app():
         storage_events.snode_restart_failed(snode)
         snode_api = SNodeClient(snode.api_endpoint, timeout=5, retry=5)
-        snode_api.spdk_process_kill(snode.rpc_port)
+        snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id)
         set_node_status(snode.get_id(), StorageNode.STATUS_OFFLINE)
 
     # If LVol Store recovery failed then stop spdk process
diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py
index 941414708..0892db54a 100644
--- a/simplyblock_core/utils/__init__.py
+++ b/simplyblock_core/utils/__init__.py
@@ -725,7 +725,13 @@ def convert_size(size: Union[int, str], unit: str, round_up: bool = False) -> in
     raw = size / (base ** exponent)
     return math.ceil(raw) if round_up else int(raw)
 
-
+def first_six_chars(s: str) -> str:
+    """
+    Returns the first six characters of a given string.
+    If the string is shorter than six characters, returns the entire string.
+    """
+    return s[:6]
+    
 def nearest_upper_power_of_2(n):
     # Check if n is already a power of 2
     if (n & (n - 1)) == 0:
diff --git a/simplyblock_web/api/internal/storage_node/docker.py b/simplyblock_web/api/internal/storage_node/docker.py
index 8e18fc276..d1ee4f9f0 100644
--- a/simplyblock_web/api/internal/storage_node/docker.py
+++ b/simplyblock_web/api/internal/storage_node/docker.py
@@ -142,6 +142,7 @@ class SPDKParams(BaseModel):
     spdk_image: Optional[str] = Field(constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE)
     cluster_ip: Optional[str] = Field(default=None, pattern=utils.IP_PATTERN)
     cluster_mode: str
+    cluster_id: str
 
 
 @api.post('/spdk_process_start', responses={
diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py
index be3193138..56b4ca563 100644
--- a/simplyblock_web/api/internal/storage_node/kubernetes.py
+++ b/simplyblock_web/api/internal/storage_node/kubernetes.py
@@ -268,6 +268,7 @@ class SPDKParams(BaseModel):
     spdk_image: str = Field(constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE)
     cluster_ip: str = Field(pattern=utils.IP_PATTERN)
     cluster_mode: str
+    cluster_id: str
 
 
 @api.post('/spdk_process_start', responses={
@@ -286,9 +287,10 @@ def spdk_process_start(body: SPDKParams):
 
     total_mem_mib = core_utils.convert_size(core_utils.parse_size(body.total_mem), 'MB') if body.total_mem else ""
 
-    if _is_pod_up(body.rpc_port) or _is_pod_present(body.rpc_port):
+    first_six_cluster_id = core_utils.first_six_chars(body.cluster_id)
+    if _is_pod_up(body.rpc_port, first_six_cluster_id) or _is_pod_present(body.rpc_port, first_six_cluster_id):
         logger.info("SPDK pod found, removing...")
-        query = utils.RPCPortParams(rpc_port=body.rpc_port)
+        query = utils.RPCPortParams(rpc_port=body.rpc_port, cluster_id=body.cluster_id)
         spdk_process_kill(query)
 
     node_prepration_job_name = "snode-spdk-job-"
@@ -351,6 +353,7 @@ def spdk_process_start(body: SPDKParams):
             'SIMPLYBLOCK_DOCKER_IMAGE': constants.SIMPLY_BLOCK_DOCKER_IMAGE,
             'GRAYLOG_SERVER_IP': body.cluster_ip,
             'MODE': body.cluster_mode,
+            'CLUSTER_ID': first_six_cluster_id,
             'SSD_PCIE': ssd_pcie_params,
             'PCI_ALLOWED': ssd_pcie_list,
             'TOTAL_HP': total_mem_mib
@@ -463,7 +466,8 @@ def spdk_process_kill(query: utils.RPCPortParams):
     k8s_core_v1 = core_utils.get_k8s_core_client()
     try:
         namespace = node_utils_k8s.get_namespace()
-        pod_name = f"snode-spdk-pod-{query.rpc_port}"
+        first_six_cluster_id = core_utils.first_six_chars(query.cluster_id)
+        pod_name = f"snode-spdk-pod-{query.rpc_port}-{first_six_cluster_id}"
         resp = k8s_core_v1.delete_namespaced_pod(pod_name, namespace)
         retries = 10
         while retries > 0:
@@ -486,9 +490,9 @@ def spdk_process_kill(query: utils.RPCPortParams):
     return utils.get_response(True)
 
 
-def _is_pod_up(rpc_port):
+def _is_pod_up(rpc_port, cluster_id):
     k8s_core_v1 = core_utils.get_k8s_core_client()
-    pod_name = f"snode-spdk-pod-{rpc_port}"
+    pod_name = f"snode-spdk-pod-{rpc_port}-{cluster_id}"
     try:
         resp = k8s_core_v1.list_namespaced_pod(node_utils_k8s.get_namespace())
         for pod in resp.items:
@@ -502,9 +506,9 @@ def _is_pod_up(rpc_port):
         return False
     return False
 
-def _is_pod_present(rpc_port):
+def _is_pod_present(rpc_port, cluster_id):
     k8s_core_v1 = core_utils.get_k8s_core_client()
-    pod_name = f"snode-spdk-pod-{rpc_port}"
+    pod_name = f"snode-spdk-pod-{rpc_port}-{cluster_id}"
     try:
         resp = k8s_core_v1.list_namespaced_pod(node_utils_k8s.get_namespace())
         for pod in resp.items:
@@ -525,7 +529,8 @@ def _is_pod_present(rpc_port):
     })}}},
 })
 def spdk_process_is_up(query: utils.RPCPortParams):
-    if _is_pod_up(query.rpc_port):
+    first_six_cluster_id = core_utils.first_six_chars(query.cluster_id)
+    if _is_pod_up(query.rpc_port, first_six_cluster_id):
         return utils.get_response(True)
     else:
         return utils.get_response(False, "SPDK container is not running")
diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2
index f10478c75..e49aca2e2 100644
--- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2
+++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2
@@ -1,7 +1,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  name: snode-spdk-pod-{{ RPC_PORT }}
+  name: snode-spdk-pod-{{ RPC_PORT }}-{{ CLUSTER_ID }}
   namespace: {{ NAMESPACE }}
   labels:
     app: spdk-app-{{ RPC_PORT }}
diff --git a/simplyblock_web/utils.py b/simplyblock_web/utils.py
index b0d1795df..27ff2ce18 100644
--- a/simplyblock_web/utils.py
+++ b/simplyblock_web/utils.py
@@ -149,6 +149,7 @@ def error_handler(exception: Exception):
 
 class RPCPortParams(BaseModel):
     rpc_port: int = Field(constants.RPC_HTTP_PROXY_PORT, ge=0, le=65536)
+    cluster_id: str
 
 
 class DeviceParams(BaseModel):

From 0e72282a7ea442e8a5b1d8ce4ecf882edbc1a1c0 Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Wed, 12 Nov 2025 21:49:16 +0300
Subject: [PATCH 08/68] Update environment variables for Simply Block (#737)

---
 simplyblock_core/env_var | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var
index cf8093b7c..fe494ca34 100644
--- a/simplyblock_core/env_var
+++ b/simplyblock_core/env_var
@@ -1,6 +1,6 @@
 SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev
-SIMPLY_BLOCK_VERSION=19.2.24
+SIMPLY_BLOCK_VERSION=19.2.25
 
-SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:inherit_default_cluster_mode
+SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main
 SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest
 

From 25e3dd29b5fa5b345e73cb905cf169fc021d6eff Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Thu, 13 Nov 2025 09:44:28 +0300
Subject: [PATCH 09/68] Main lvol sync delete (#734)

* Add lvol sync delete task runner

* fix linter issues

* fix
---
 simplyblock_core/cluster_ops.py               | 12 +++
 .../controllers/tasks_controller.py           | 23 ++++++
 simplyblock_core/env_var                      |  2 +-
 simplyblock_core/models/job_schedule.py       |  1 +
 simplyblock_core/models/storage_node.py       |  1 -
 .../scripts/docker-compose-swarm.yml          | 14 ++++
 simplyblock_core/services/lvol_monitor.py     | 18 +----
 simplyblock_core/services/snapshot_monitor.py |  5 +-
 .../services/tasks_runner_port_allow.py       | 22 ++----
 .../services/tasks_runner_sync_lvol_del.py    | 77 +++++++++++++++++++
 10 files changed, 140 insertions(+), 35 deletions(-)
 create mode 100644 simplyblock_core/services/tasks_runner_sync_lvol_del.py

diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py
index 536546eab..dc429b8f9 100644
--- a/simplyblock_core/cluster_ops.py
+++ b/simplyblock_core/cluster_ops.py
@@ -1194,6 +1194,18 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None,
                 networks=["host"],
                 constraints=["node.role == manager"]
             )
+
+        if "app_TasksRunnerLVolSyncDelete" not in service_names:
+            logger.info("Creating lvol sync delete service")
+            cluster_docker.services.create(
+                image=service_image,
+                command="python simplyblock_core/services/tasks_runner_sync_lvol_del.py",
+                name="app_TasksRunnerLVolSyncDelete",
+                mounts=["/etc/foundationdb:/etc/foundationdb"],
+                env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"],
+                networks=["host"],
+                constraints=["node.role == manager"]
+            )
         logger.info("Done updating mgmt cluster")
 
     elif cluster.mode == "kubernetes":
diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py
index 689027d08..b7c434f63 100644
--- a/simplyblock_core/controllers/tasks_controller.py
+++ b/simplyblock_core/controllers/tasks_controller.py
@@ -70,6 +70,11 @@ def _add_task(function_name, cluster_id, node_id, device_id,
         if task_id:
             logger.info(f"Task found, skip adding new task: {task_id}")
             return False
+    elif function_name == JobSchedule.FN_LVOL_SYNC_DEL:
+        task_id = get_lvol_sync_del_task(cluster_id, node_id, function_params['lvol_bdev_name'])
+        if task_id:
+            logger.info(f"Task found, skip adding new task: {task_id}")
+            return False
 
     task_obj = JobSchedule()
     task_obj.uuid = str(uuid.uuid4())
@@ -386,3 +391,21 @@ def get_jc_comp_task(cluster_id, node_id, jm_vuid=0):
                 if jm_vuid and "jm_vuid" in task.function_params and task.function_params["jm_vuid"] == jm_vuid:
                     return task.uuid
     return False
+
+
+def add_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name):
+    return _add_task(JobSchedule.FN_LVOL_SYNC_DEL, cluster_id, node_id, "",
+                     function_params={"lvol_bdev_name": lvol_bdev_name}, max_retry=10)
+
+def get_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name=None):
+    tasks = db.get_job_tasks(cluster_id)
+    for task in tasks:
+        if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL and task.node_id == node_id :
+            if task.status != JobSchedule.STATUS_DONE and task.canceled is False:
+                if lvol_bdev_name:
+                    if "lvol_bdev_name" in task.function_params and task.function_params["lvol_bdev_name"] == lvol_bdev_name:
+                        return task.uuid
+                else:
+                    return task.uuid
+    return False
+
diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var
index fe494ca34..468ba7a02 100644
--- a/simplyblock_core/env_var
+++ b/simplyblock_core/env_var
@@ -1,6 +1,6 @@
 SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev
 SIMPLY_BLOCK_VERSION=19.2.25
 
-SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main
+SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main-lvol-sync-delete
 SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest
 
diff --git a/simplyblock_core/models/job_schedule.py b/simplyblock_core/models/job_schedule.py
index 3d87a9aca..bbdcd7871 100644
--- a/simplyblock_core/models/job_schedule.py
+++ b/simplyblock_core/models/job_schedule.py
@@ -22,6 +22,7 @@ class JobSchedule(BaseModel):
     FN_BALANCING_AFTER_DEV_REMOVE = "balancing_on_dev_rem"
     FN_BALANCING_AFTER_DEV_EXPANSION = "balancing_on_dev_add"
     FN_JC_COMP_RESUME = "jc_comp_resume"
+    FN_LVOL_SYNC_DEL = "lvol_sync_del"
 
     canceled: bool = False
     cluster_id: str = ""
diff --git a/simplyblock_core/models/storage_node.py b/simplyblock_core/models/storage_node.py
index 8c76d3649..81639c556 100644
--- a/simplyblock_core/models/storage_node.py
+++ b/simplyblock_core/models/storage_node.py
@@ -102,7 +102,6 @@ class StorageNode(BaseNodeObject):
     hublvol: HubLVol = None  # type: ignore[assignment]
     active_tcp: bool = True
     active_rdma: bool = False
-    lvol_sync_del_queue: List[str] = []
 
     def rpc_client(self, **kwargs):
         """Return rpc client to this node
diff --git a/simplyblock_core/scripts/docker-compose-swarm.yml b/simplyblock_core/scripts/docker-compose-swarm.yml
index ba0f8b61d..fd79f43c1 100644
--- a/simplyblock_core/scripts/docker-compose-swarm.yml
+++ b/simplyblock_core/scripts/docker-compose-swarm.yml
@@ -349,6 +349,20 @@ services:
     environment:
       SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL"
 
+  TasksRunnerLVolSyncDelete:
+    <<: *service-base
+    image: $SIMPLYBLOCK_DOCKER_IMAGE
+    command: "python simplyblock_core/services/tasks_runner_sync_lvol_del.py"
+    deploy:
+      placement:
+        constraints: [node.role == manager]
+    volumes:
+      - "/etc/foundationdb:/etc/foundationdb"
+    networks:
+      - hostnet
+    environment:
+      SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL"
+
 networks:
   monitoring-net:
     external: true
diff --git a/simplyblock_core/services/lvol_monitor.py b/simplyblock_core/services/lvol_monitor.py
index 884b67396..8486f3a32 100644
--- a/simplyblock_core/services/lvol_monitor.py
+++ b/simplyblock_core/services/lvol_monitor.py
@@ -132,8 +132,7 @@ def process_lvol_delete_finish(lvol):
         sec_node = db.get_storage_node_by_id(snode.get_id())
 
     if sec_node:
-        sec_node.lvol_sync_del_queue.append(f"{lvol.lvs_name}/{lvol.lvol_bdev}")
-        sec_node.write_to_db()
+        tasks_controller.add_lvol_sync_del_task(sec_node.cluster_id, sec_node.get_id(), f"{lvol.lvs_name}/{lvol.lvol_bdev}")
 
     lvol_events.lvol_delete(lvol)
     lvol.remove(db.kv_store)
@@ -349,19 +348,6 @@ def process_lvol_delete_try_again(lvol):
                     present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names)
                     set_snapshot_health_check(snap, present)
 
-                snode = db.get_storage_node_by_id(snode.get_id())
-                if snode.status == StorageNode.STATUS_ONLINE:
-                    not_deleted = []
-                    for bdev_name in snode.lvol_sync_del_queue:
-                        logger.info(f"Sync delete bdev: {bdev_name} from node: {snode.get_id()}")
-                        ret, err = snode.rpc_client().delete_lvol(bdev_name, del_async=True)
-                        if not ret:
-                            if "code" in err and err["code"] == -19:
-                                logger.error(f"Sync delete completed with error: {err}")
-                            else:
-                                logger.error(f"Failed to sync delete bdev: {bdev_name} from node: {snode.get_id()}")
-                                not_deleted.append(bdev_name)
-                    snode.lvol_sync_del_queue = not_deleted
-                    snode.write_to_db()
+
 
     time.sleep(constants.LVOL_MONITOR_INTERVAL_SEC)
diff --git a/simplyblock_core/services/snapshot_monitor.py b/simplyblock_core/services/snapshot_monitor.py
index c82476e7b..a99ed89f3 100644
--- a/simplyblock_core/services/snapshot_monitor.py
+++ b/simplyblock_core/services/snapshot_monitor.py
@@ -5,7 +5,7 @@
 
 from simplyblock_core import constants, db_controller, utils
 from simplyblock_core.models.cluster import Cluster
-from simplyblock_core.controllers import health_controller, snapshot_events
+from simplyblock_core.controllers import health_controller, snapshot_events, tasks_controller
 from simplyblock_core.models.snapshot import SnapShot
 from simplyblock_core.models.storage_node import StorageNode
 from simplyblock_core.rpc_client import RPCClient
@@ -76,8 +76,7 @@ def process_snap_delete_finish(snap, leader_node):
 
     non_leader = db.get_storage_node_by_id(non_leader_id)
     if non_leader:
-        non_leader.lvol_sync_del_queue.append(snap.snap_bdev)
-        non_leader.write_to_db()
+        tasks_controller.add_lvol_sync_del_task(non_leader.cluster_id, non_leader.get_id(), snap.snap_bdev)
 
     snapshot_events.snapshot_delete(snap)
     snap.remove(db.kv_store)
diff --git a/simplyblock_core/services/tasks_runner_port_allow.py b/simplyblock_core/services/tasks_runner_port_allow.py
index a39de42ab..e95dbdf94 100644
--- a/simplyblock_core/services/tasks_runner_port_allow.py
+++ b/simplyblock_core/services/tasks_runner_port_allow.py
@@ -3,7 +3,7 @@
 
 
 from simplyblock_core import db_controller, utils, storage_node_ops, distr_controller
-from simplyblock_core.controllers import tcp_ports_events, health_controller
+from simplyblock_core.controllers import tcp_ports_events, health_controller, tasks_controller
 from simplyblock_core.fw_api_client import FirewallClient
 from simplyblock_core.models.job_schedule import JobSchedule
 from simplyblock_core.models.cluster import Cluster
@@ -196,19 +196,13 @@
                             task.status = JobSchedule.STATUS_RUNNING
                             task.write_to_db(db.kv_store)
 
-                        not_deleted = []
-                        for bdev_name in snode.lvol_sync_del_queue:
-                            logger.info(f"Sync delete bdev: {bdev_name} from node: {snode.get_id()}")
-                            ret, err = snode.rpc_client().delete_lvol(bdev_name, del_async=True)
-                            if not ret:
-                                if "code" in err and err["code"] == -19:
-                                    logger.error(f"Sync delete completed with error: {err}")
-                                else:
-                                    logger.error(
-                                        f"Failed to sync delete bdev: {bdev_name} from node: {snode.get_id()}")
-                                    not_deleted.append(bdev_name)
-                        snode.lvol_sync_del_queue = not_deleted
-                        snode.write_to_db()
+                        # wait for lvol sync delete
+                        lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id)
+                        while lvol_sync_del_found:
+                            logger.info("Lvol sync delete task found, waiting")
+                            can_continue = False
+                            time.sleep(3)
+                            lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id)
 
                         if sec_node and sec_node.status == StorageNode.STATUS_ONLINE:
                             sec_rpc_client = sec_node.rpc_client()
diff --git a/simplyblock_core/services/tasks_runner_sync_lvol_del.py b/simplyblock_core/services/tasks_runner_sync_lvol_del.py
new file mode 100644
index 000000000..fbf0c1ee4
--- /dev/null
+++ b/simplyblock_core/services/tasks_runner_sync_lvol_del.py
@@ -0,0 +1,77 @@
+# coding=utf-8
+import time
+
+
+from simplyblock_core import db_controller, utils
+from simplyblock_core.models.job_schedule import JobSchedule
+from simplyblock_core.models.cluster import Cluster
+from simplyblock_core.models.storage_node import StorageNode
+
+logger = utils.get_logger(__name__)
+
+# get DB controller
+db = db_controller.DBController()
+
+
+logger.info("Starting Tasks runner...")
+while True:
+
+    clusters = db.get_clusters()
+    if not clusters:
+        logger.error("No clusters found!")
+    else:
+        for cl in clusters:
+            if cl.status == Cluster.STATUS_IN_ACTIVATION:
+                continue
+
+            tasks = db.get_job_tasks(cl.get_id(), reverse=False)
+            for task in tasks:
+
+                if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL:
+                    if task.status != JobSchedule.STATUS_DONE:
+
+                        # get new task object because it could be changed from cancel task
+                        task = db.get_task_by_id(task.uuid)
+
+                        if task.canceled:
+                            task.function_result = "canceled"
+                            task.status = JobSchedule.STATUS_DONE
+                            task.write_to_db(db.kv_store)
+                            continue
+
+                        node = db.get_storage_node_by_id(task.node_id)
+
+                        if not node:
+                            task.function_result = "node not found"
+                            task.status = JobSchedule.STATUS_DONE
+                            task.write_to_db(db.kv_store)
+                            continue
+
+                        if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_ONLINE]:
+                            msg = f"Node is {node.status}, retry task"
+                            logger.info(msg)
+                            task.function_result = msg
+                            task.status = JobSchedule.STATUS_SUSPENDED
+                            task.write_to_db(db.kv_store)
+                            continue
+
+                        if task.status != JobSchedule.STATUS_RUNNING:
+                            task.status = JobSchedule.STATUS_RUNNING
+                            task.write_to_db(db.kv_store)
+
+                        lvol_bdev_name = task.function_params["lvol_bdev_name"]
+
+                        logger.info(f"Sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}")
+                        ret, err = node.rpc_client().delete_lvol(lvol_bdev_name, del_async=True)
+                        if not ret:
+                            if "code" in err and err["code"] == -19:
+                                logger.error(f"Sync delete completed with error: {err}")
+                            else:
+                                logger.error(
+                                    f"Failed to sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}")
+
+                        task.function_result = f"bdev {lvol_bdev_name} deleted"
+                        task.status = JobSchedule.STATUS_DONE
+                        task.write_to_db(db.kv_store)
+
+    time.sleep(3)

From cd68c603133ccef6709fc792acb52b648bdca009 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Thu, 13 Nov 2025 09:56:35 +0100
Subject: [PATCH 10/68] added fdb multi AZ support (#736)

* added fdb and storageclass multi AZ support

* use ipv4 for fdb controller manager healthcheck

* updated fdb controller manager resource name
---
 simplyblock_core/constants.py                 |  3 +-
 .../charts/templates/foundationdb.yaml        | 45 ++++++++++++-------
 .../charts/templates/storage_class.yaml       | 10 ++++-
 simplyblock_core/scripts/charts/values.yaml   |  8 +++-
 4 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py
index 41824c73a..d82275954 100644
--- a/simplyblock_core/constants.py
+++ b/simplyblock_core/constants.py
@@ -133,7 +133,8 @@ def get_config_var(name, default=None):
 LVOL_NVME_CONNECT_NR_IO_QUEUES=3
 LVOL_NVME_KEEP_ALIVE_TO=10
 LVOL_NVME_KEEP_ALIVE_TO_TCP=7
-LVOL_NVMF_PORT_START=int(os.getenv('LVOL_NVMF_PORT_START', 9100))
+LVOL_NVMF_PORT_ENV = os.getenv("LVOL_NVMF_PORT_START", "")
+LVOL_NVMF_PORT_START = int(LVOL_NVMF_PORT_ENV) if LVOL_NVMF_PORT_ENV else 9100
 QPAIR_COUNT=32
 CLIENT_QPAIR_COUNT=3
 NVME_TIMEOUT_US=8000000
diff --git a/simplyblock_core/scripts/charts/templates/foundationdb.yaml b/simplyblock_core/scripts/charts/templates/foundationdb.yaml
index 1a3134e58..ddcdf9e92 100644
--- a/simplyblock_core/scripts/charts/templates/foundationdb.yaml
+++ b/simplyblock_core/scripts/charts/templates/foundationdb.yaml
@@ -2,20 +2,20 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: controller-manager
+  name: simplyblock-fdb-controller-manager
   labels:
-    control-plane: controller-manager
-    app: controller-manager
+    control-plane: simplyblock-fdb-controller-manager
+    app: simplyblock-fdb-controller-manager
 spec:
   selector:
     matchLabels:
-      app: controller-manager
+      app: simplyblock-fdb-controller-manager
   replicas: 1
   template:
     metadata:
       labels:
-        control-plane: controller-manager
-        app: controller-manager
+        control-plane: simplyblock-fdb-controller-manager
+        app: simplyblock-fdb-controller-manager
     spec:
       securityContext:
         runAsUser: 4059
@@ -28,7 +28,7 @@ spec:
           emptyDir: {}
         - name: fdb-binaries
           emptyDir: {}
-      serviceAccountName: controller-manager
+      serviceAccountName: simplyblock-fdb-controller-manager
       initContainers:
         - name: foundationdb-kubernetes-init-7-3
           image: foundationdb/fdb-kubernetes-monitor:7.3.63
@@ -51,6 +51,8 @@ spec:
       containers:
         - command:
             - /manager
+          args:
+            - "--health-probe-bind-address=:9443"
           image: foundationdb/fdb-kubernetes-operator:v2.13.0
           name: manager
           env:
@@ -86,13 +88,13 @@ spec:
 apiVersion: v1
 kind: ServiceAccount
 metadata:
-  name: controller-manager
+  name: simplyblock-fdb-controller-manager
 
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
-  name: manager-role
+  name: simplyblock-fdb-manager-role
 rules:
 - apiGroups:
   - ""
@@ -164,7 +166,7 @@ apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   creationTimestamp: null
-  name: manager-clusterrole
+  name: simplyblock-fdb-manager-clusterrole
 rules:
 - apiGroups:
   - ""
@@ -179,27 +181,27 @@ apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
   creationTimestamp: null
-  name: manager-rolebinding
+  name: simplyblock-fdb-manager-rolebinding
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
-  name: manager-role
+  name: simplyblock-fdb-manager-role
 subjects:
 - kind: ServiceAccount
-  name: controller-manager
+  name: simplyblock-fdb-controller-manager
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
   creationTimestamp: null
-  name: manager-clusterrolebinding
+  name: simplyblock-fdb-manager-clusterrolebinding
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
-  name: manager-clusterrole
+  name: simplyblock-fdb-manager-clusterrole
 subjects:
 - kind: ServiceAccount
-  name: controller-manager
+  name: simplyblock-fdb-controller-manager
   namespace: metadata.namespace
 
 ##### cluster file #################
@@ -213,7 +215,11 @@ spec:
     replacements:
       enabled: true
   faultDomain:
+  {{- if .Values.foundationdb.multiAZ }}
+    key: topology.kubernetes.io/zone
+  {{- else }}
     key: foundationdb.org/none
+  {{- end }}
   imageType: split
   labels:
     filterOnOwnerReference: false
@@ -225,10 +231,17 @@ spec:
     - foundationdb.org/fdb-process-group-id
   minimumUptimeSecondsForBounce: 60
   processCounts:
+  {{- if .Values.foundationdb.multiAZ }}
+    cluster_controller: 1
+    log: 4
+    storage: 4
+    stateless: -1
+  {{- else }}
     cluster_controller: 1
     log: 3
     storage: 3
     stateless: -1
+  {{- end }}
   processes:
     general:
       customParameters:
diff --git a/simplyblock_core/scripts/charts/templates/storage_class.yaml b/simplyblock_core/scripts/charts/templates/storage_class.yaml
index 64e5e6280..9b6a2c9ce 100644
--- a/simplyblock_core/scripts/charts/templates/storage_class.yaml
+++ b/simplyblock_core/scripts/charts/templates/storage_class.yaml
@@ -7,4 +7,12 @@ provisioner: openebs.io/local
 allowVolumeExpansion: true
 reclaimPolicy: Retain
 volumeBindingMode: WaitForFirstConsumer
-  
+{{- if .Values.storageclass.allowedTopologyZones }}
+allowedTopologies:
+- matchLabelExpressions:
+  - key: topology.kubernetes.io/zone
+    values:
+{{- range .Values.storageclass.allowedTopologyZones }}
+    - {{ . }}
+{{- end }}
+{{- end }}
diff --git a/simplyblock_core/scripts/charts/values.yaml b/simplyblock_core/scripts/charts/values.yaml
index 467734176..994e9a21f 100644
--- a/simplyblock_core/scripts/charts/values.yaml
+++ b/simplyblock_core/scripts/charts/values.yaml
@@ -24,10 +24,16 @@ image:
 
 ports:
   lvolNvmfPortStart:
-  
+
+storageclass:  
+  allowedTopologyZones: []
+
 openebs:
   enabled: true
 
+foundationdb:
+  multiAZ: false
+
 mongodb:
   name: "simplyblock-mongodb"
   deployment_name: "simplyblock-mongodb"

From 1c38b6eead3cd1fc0d59b6bd7255955016783ef8 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Fri, 14 Nov 2025 13:34:42 +0100
Subject: [PATCH 11/68] increased k8s fdb memory limit (#740)

---
 .../scripts/charts/templates/foundationdb.yaml            | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/simplyblock_core/scripts/charts/templates/foundationdb.yaml b/simplyblock_core/scripts/charts/templates/foundationdb.yaml
index ddcdf9e92..a3b2d8ccb 100644
--- a/simplyblock_core/scripts/charts/templates/foundationdb.yaml
+++ b/simplyblock_core/scripts/charts/templates/foundationdb.yaml
@@ -298,10 +298,10 @@ spec:
             resources:
               limits:
                 cpu: 500m
-                memory: 2Gi
+                memory: 4Gi
               requests:
                 cpu: 100m
-                memory: 512Mi
+                memory: 1Gi
             securityContext:
               runAsUser: 0
           affinity:
@@ -321,10 +321,10 @@ spec:
             resources:
               limits:
                 cpu: 500m
-                memory: 2Gi
+                memory: 4Gi
               requests:
                 cpu: 100m
-                memory: 512Mi
+                memory: 1Gi
             securityContext:
               runAsUser: 0
           affinity:

From 5d9e0a47e330e5e5d0a3a7ccd980f09978c2535d Mon Sep 17 00:00:00 2001
From: noctarius aka Christoph Engelbert <me@noctarius.com>
Date: Fri, 14 Nov 2025 14:17:24 +0100
Subject: [PATCH 12/68] Added MIT License (#742)

---
 LICENSE | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 LICENSE

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 000000000..37d1834ca
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023-2025 simplyblock GmbH
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

From ee8d4605b491af786457794d76b330e152a8074a Mon Sep 17 00:00:00 2001
From: schmidt-scaled <82834682+schmidt-scaled@users.noreply.github.com>
Date: Sat, 15 Nov 2025 12:28:16 +0300
Subject: [PATCH 13/68] Update constants.py (#744)

---
 simplyblock_core/constants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py
index d82275954..30902d839 100644
--- a/simplyblock_core/constants.py
+++ b/simplyblock_core/constants.py
@@ -225,4 +225,4 @@ def get_config_var(name, default=None):
 
 qos_class_meta_and_migration_weight_percent = 25
 
-MIG_PARALLEL_JOBS = 16
\ No newline at end of file
+MIG_PARALLEL_JOBS = 64

From f83145cf41d513c7bc5ac23ae8bc5459cd29c8cc Mon Sep 17 00:00:00 2001
From: schmidt-scaled <schmidt@scaled.cloud>
Date: Sat, 15 Nov 2025 12:35:06 +0300
Subject: [PATCH 14/68] set size of lvstore cluster in constants (as ratio to
 distrib page size)

---
 .../controllers/lvol_migration_controller.py  | 359 ++++++------------
 simplyblock_core/models/lvol_migration.py     | 166 ++++++--
 simplyblock_core/rpc_client.py                |  54 +++
 3 files changed, 301 insertions(+), 278 deletions(-)

diff --git a/simplyblock_core/controllers/lvol_migration_controller.py b/simplyblock_core/controllers/lvol_migration_controller.py
index 71cb30426..9a4067943 100644
--- a/simplyblock_core/controllers/lvol_migration_controller.py
+++ b/simplyblock_core/controllers/lvol_migration_controller.py
@@ -1,251 +1,122 @@
-# Ticket description for live lvol migration:
-# Live lvol migration moves lvols together with all related objects
-# (related snapshots, related clones) from one storage node to another
-# storage node in the same cluster. This happens online and very fast,
-# as no actual data is copied.
-#
-# It is NOT possible:
-# - to move snapshots or clones independently from the lvol
-# - to move namespace lvols belonging to the same subsystem independently
-#
-# We need to implement this feature in control plane in two steps:
-# a) move a specific lvol and its related objects based on the lvol name
-#    or uuid from one node to another node. The other node must be online
-#    and it must not be the secondary of the node the lvol is currently attached to.
-# b) create an automatism, which periodically controls the balance of
-#    performance and ram consumption across nodes and re-balances certain
-#    lvols if a node becomes over-loaded
-
 import asyncio
-from typing import Iterable
+from typing import Optional
 
 from ..models.lvol_migration import (
-    LvolMigration,
-    MigrationItem,
-    MigrationState,
-    StorageObject
+    MigrationObject, MigrationStream, Snapshot,
+    MigrationState, StreamState, ObjectMigrationState
 )
-from enum import Enum
-
-
-class ObjType(Enum):
-    SNAPSHOT = "snapshot"
-    CLONE = "clone"
-    LVOL = "lvol"
-
-
-class MigrationQueue(LvolMigration):
-    def add_object(self, storage_obj: StorageObject, obj_type: ObjType):
-        item = MigrationItem(storage=storage_obj, state=MigrationState.NEW)
-        item.type = obj_type
-        self.migrations.append(item)
-        return item
-
-    def iter_snapshots(self):
-        return (m for m in self.migrations if m.type == ObjType.SNAPSHOT)
-
-    def iter_clones(self):
-        return (m for m in self.migrations if m.type == ObjType.CLONE)
-
-    def iter_lvol(self):
-        return (m for m in self.migrations if m.type == ObjType.LVOL)
-
-
-# -------------------------------------------------------------------------
-# Async-capable Controller
-# -------------------------------------------------------------------------
-
-class LvolMigrationController:
-
-    # ---------------------------------------------------------------------
-    # Public entry point
-    # ---------------------------------------------------------------------
-
-    async def migrate_lvol(self, lvol) -> str:
-        mq = self.create_migration_queue(lvol)
-
-        if self.all_nodes_online():
-            self.freeze_snapshots_clones(lvol)
-
-            result = await self.process_migration_queue(mq)
-
-            if result != "DONE":
-                self.register_continue(mq)
-                return "SUSPENDED"
-
-            self.unfreeze_snapshots_clones(lvol)
-            return "DONE"
-
-        return "SUSPENDED"
-
-    # ---------------------------------------------------------------------
-
-    def create_migration_queue(self, lvol) -> MigrationQueue:
-        mq = MigrationQueue(
-            primary_source=lvol.primary,
-            secondary_source=lvol.secondary,
-            primary_target=lvol.target_primary,
-            secondary_target=lvol.target_secondary,
-        )
-
-        for s in lvol.get_snapshots():
-            mq.add_object(s, ObjType.SNAPSHOT)
-
-        for c in lvol.get_clones():
-            mq.add_object(c, ObjType.CLONE)
-
-        mq.add_object(lvol, ObjType.LVOL)
-        return mq
-
-    # ---------------------------------------------------------------------
-    # Core logic with asyncio
-    # ---------------------------------------------------------------------
-
-    async def process_migration_queue(self, mq: MigrationQueue) -> str:
-
-        if not await self._process_subset(mq, mq.iter_snapshots()):
-            return "CANCELED"
-
-        if not await self._process_subset(mq, mq.iter_clones()):
-            return "CANCELED"
-
-        if not await self._process_subset(mq, mq.iter_lvol()):
-            return "CANCELED"
-
-        return "DONE"
-
-    # ---------------------------------------------------------------------
-
-    async def _process_subset(self, mq: MigrationQueue, iterator: Iterable[MigrationItem]) -> bool:
-        tasks = []
-
-        for item in iterator:
-            if item.state in (MigrationState.NEW, MigrationState.IN_MIGRATION):
-                item.state = MigrationState.IN_MIGRATION
-                tasks.append(asyncio.create_task(self.migrate_object(item)))
-
-        if not tasks:
-            return True
 
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-
-        if not self.all_nodes_online():
-            return False
-
-        # Check for errors
-        for r in results:
-            if isinstance(r, Exception):
-                return False
-            if getattr(r, "failed", False):
-                return False
-
-        # Mark items done
-        for item in iterator:
-            if item.state == MigrationState.IN_MIGRATION:
-                item.state = MigrationState.MIGRATED
-
-        return True
-
-    # ---------------------------------------------------------------------
-    # Cleanup
-    # ---------------------------------------------------------------------
-
-    async def cleanup_migration_queue(self, mq: MigrationQueue, lvol):
-        mq.status = "IN_DELETION"
-
-        tasks = []
-        for item in mq.migrations:
-            if item.state != MigrationState.NEW:
-                tasks.append(asyncio.create_task(
-                    self.async_delete(item.storage, mq.primary_target)
-                ))
-                tasks.append(asyncio.create_task(
-                    self.register_syn_delete(item.storage, mq.secondary_target)
-                ))
-
-        if tasks:
-            await asyncio.gather(*tasks, return_exceptions=True)
-
-        mq.status = "DELETED"
-        self.unfreeze_snapshots_clones(lvol)
-
-    # ---------------------------------------------------------------------
-    # Individual migration operations (async)
-    # ---------------------------------------------------------------------
-
-    async def migrate_object(self, item: MigrationItem):
-        src = item.storage.source_node
-        dst = item.storage.target_node
-
-        await self.create_target_namespace(dst, item.storage)
-        await self.connect_source_to_target(src, dst, item.storage)
-
-        if item.type == ObjType.SNAPSHOT:
-            return await self._migrate_snapshot(item)
-        elif item.type == ObjType.CLONE:
-            return await self._migrate_clone(item)
+# ---------------------------------------------------------------------------
+# CONTROLLER
+# ---------------------------------------------------------------------------
+
+class MigrationController:
+
+    def __init__(self, migration: MigrationObject):
+        self.migration = migration
+        if self.migration.completion_poll_queue is None:
+            self.migration.completion_poll_queue = asyncio.Queue()
+
+    # -----------------------------------------------------------------------
+    # START MIGRATION
+    # -----------------------------------------------------------------------
+    async def migrate_start(self):
+        """Entry point: prepare snapshots and streams, start migration."""
+        self.migration.status = MigrationState.PREPARING
+
+        # 1. Check all nodes online (mocked)
+        if not self._nodes_online():
+            self.migration.status = MigrationState.SUSPENDED
+            return
+
+        # 2. Build streams for all logical volumes
+        await self.migrate_prepare()
+
+        self.migration.status = MigrationState.RUNNING
+        await self.migration_iterate_streams()
+
+    async def migrate_prepare(self):
+        """Prepare each logical volume: build streams and snapshot references."""
+        for lv in self.migration.logical_volumes:
+            stream = MigrationStream(
+                lvol_name=lv.name,
+                lvol_state=ObjectMigrationState.NEW,
+                lvol_namespace=lv.namespace_uuid
+            )
+            # Link snapshots if any exist for this LV
+            for snapshot in self.migration.snapshots:
+                if snapshot.name.startswith(lv.name):  # simple match; customize
+                    stream.append_snapshot(snapshot)
+            self.migration.streams.append(stream)
+
+    # -----------------------------------------------------------------------
+    # ITERATE STREAMS
+    # -----------------------------------------------------------------------
+    async def migration_iterate_streams(self):
+        """Iterate over all streams sequentially."""
+        for stream in self.migration.streams:
+            if stream.status not in {StreamState.DONE, StreamState.FAILED}:
+                await self.migrate_stream_start(stream)
+
+        # If all streams done, mark migration done
+        if all(s.status == StreamState.DONE for s in self.migration.streams):
+            self.migration.status = MigrationState.DONE
+
+    # -----------------------------------------------------------------------
+    # STREAM OPERATIONS
+    # -----------------------------------------------------------------------
+    async def migrate_stream_start(self, stream: MigrationStream):
+        """Start migration for a stream."""
+        stream.status = StreamState.RUNNING
+
+        # Iterate snapshots in the stream
+        current = stream.head_snapshot_ref
+        while current:
+            snapshot = current.snapshot
+            if snapshot.status == ObjectMigrationState.NEW:
+                await spdk_set_migration_flag(snapshot.name)
+                await spdk_transfer_snapshot(snapshot.name, stream.lvol_name)
+                snapshot.status = ObjectMigrationState.DONE
+                # Add to completion poll queue
+                await self.migration.completion_poll_queue.put(snapshot.name)
+            current = current.next
+
+        # Once snapshots done, migrate the main LV
+        await self.migrate_stream_resume(stream)
+
+    async def migrate_stream_resume(self, stream: MigrationStream):
+        """Handle LV migration after snapshots."""
+        if stream.lvol_state == ObjectMigrationState.NEW:
+            await spdk_final_lvol_migration(stream.lvol_name)
+            stream.lvol_state = ObjectMigrationState.DONE
+            stream.status = StreamState.DONE
+
+        # Clean up intermediate resources
+        await self.migrate_stream_cleanup(stream)
+
+    async def migrate_stream_cleanup(self, stream: MigrationStream):
+        """Cleanup temporary namespaces, NQNs, etc."""
+        # Placeholder: remove temp subsystems or namespaces
+        await asyncio.sleep(0.01)
+        # No additional state changes needed for this skeleton
+
+    # -----------------------------------------------------------------------
+    # MIGRATION CLEANUP
+    # -----------------------------------------------------------------------
+    async def migrate_cleanup(self, failed: bool = False):
+        """Global migration cleanup."""
+        if failed:
+            self.migration.status = MigrationState.FAILED
+            # Mark streams failed if not done
+            for stream in self.migration.streams:
+                if stream.status != StreamState.DONE:
+                    stream.status = StreamState.FAILED
         else:
-            return await self._migrate_lvol(item)
-
-    # ---------------------------------------------------------------------
-    # Snapshot migration with retries
-    # ---------------------------------------------------------------------
-
-    async def _migrate_snapshot(self, item):
-        for attempt in range(5):
-            result = await self.run_migration_rpc(item)
-            if result.success:
-                return result
+            self.migration.status = MigrationState.DONE
 
-            if not self.all_nodes_online():
-                break
-
-            await asyncio.sleep(self.retry_delay(attempt))
-
-        return result
-
-    async def _migrate_clone(self, item):
-        return await self.run_migration_rpc(item)
-
-    async def _migrate_lvol(self, item):
-        return await self.run_migration_rpc(item)
-
-    # ---------------------------------------------------------------------
-    # Placeholder hooks (inject actual system implementation)
-    # ---------------------------------------------------------------------
-
-    async def create_target_namespace(self, dst, storage):
-        pass
-
-    async def connect_source_to_target(self, src, dst, storage):
-        pass
-
-    async def run_migration_rpc(self, item):
-        """
-        Must return an object with fields:
-            .success  -> bool
-            .failed   -> bool
-        or raise exception.
-        """
-        pass
-
-    async def async_delete(self, storage, target):
-        pass
-
-    async def register_syn_delete(self, storage, target):
-        pass
-
-    def freeze_snapshots_clones(self, lvol):
-        pass
-
-    def unfreeze_snapshots_clones(self, lvol):
-        pass
-
-    def all_nodes_online(self) -> bool:
-        pass
-
-    def register_continue(self, mq):
-        pass
-
-    def retry_delay(self, attempt: int) -> float:
-        return min(2 ** attempt, 60)   # exponential backoff
+    # -----------------------------------------------------------------------
+    # HELPER FUNCTIONS
+    # -----------------------------------------------------------------------
+    def _nodes_online(self) -> bool:
+        """Mock node health check."""
+        return True
diff --git a/simplyblock_core/models/lvol_migration.py b/simplyblock_core/models/lvol_migration.py
index 9cfa04d04..5d9a51c5e 100644
--- a/simplyblock_core/models/lvol_migration.py
+++ b/simplyblock_core/models/lvol_migration.py
@@ -1,48 +1,146 @@
-from enum import Enum
+from __future__ import annotations
 from dataclasses import dataclass, field
+from enum import Enum
 from typing import List, Optional
+import uuid
+import asyncio
+
+
+# ---------------------------------------------------------------------------
+# ENUMS
+# ---------------------------------------------------------------------------
 
-from simplyblock_core.models.lvol_model import LVol
-from simplyblock_core.models.snapshot import SnapShot
-from simplyblock_core.models.storage_node import StorageNode
+class MigrationState(str, Enum):
+    NEW = "new"
+    PREPARING = "preparing"
+    RUNNING = "running"
+    SUSPENDED = "suspended"
+    FAILED = "failed"
+    PARTIALLY_FAILED = "partially_failed"
+    DONE = "done"
+
+
+class StreamState(str, Enum):
+    NEW = "new"
+    RUNNING = "running"
+    SUSPENDED = "suspended"
+    FAILED = "failed"
+    CLEANUP = "cleanup"
+    DONE = "done"
 
 
-class MigrationState(Enum):
+class ObjectMigrationState(str, Enum):
     NEW = "new"
-    IN_MIGRATION = "in-migration"
-    MIGRATED = "migrated"
+    RUNNING = "running"
+    SUSPENDED = "suspended"
+    CANCELED = "canceled"
+    DONE = "done"
 
 
+# ---------------------------------------------------------------------------
+# DATA MODELS
+# ---------------------------------------------------------------------------
+
 @dataclass
-class StorageObject:
-    """Represents an lvol/clone/snapshot or similar."""
-    id: str
-    lvol_ref: LVol
-    snap_ref: SnapShot
-    type: str  # e.g., "lvol", "clone", "snapshot"
+class LogicalVolumeRef:
+    """Reference to a logical volume participating in a migration."""
+    name: str  # "LVS/LV"
+    namespace_uuid: str
+    crypto_bdev_name: Optional[str] = None
 
 
 @dataclass
-class MigrationItem:
-    """A single storage-object migration entry."""
-    storage: StorageObject
-    state: MigrationState = MigrationState.NEW
+class Snapshot:
+    """
+    Global snapshot object, exists only once.
+    Stores all per-snapshot migration metadata.
+    """
+    name: str  # "LVS/LV"
+    source_uuid: Optional[str] = None
+    target_uuid: Optional[str] = None
+
+    # Migration metadata
+    temporary_nqn: Optional[str] = None
+    temporary_namespace: Optional[str] = None
+    mapid: Optional[str] = None
+    status: ObjectMigrationState = ObjectMigrationState.NEW
+
 
 @dataclass
-class LvolMigration:
-    """Model representing a full logical-volume migration plan."""
-    primary_source: StorageNode
-    secondary_source: StorageNode
-    primary_target: StorageNode
-    secondary_target: StorageNode
-    migrations: List[MigrationItem] = field(default_factory=list)
-
-    def add_migration(self, storage: StorageObject) -> None:
-        self.migrations.append(MigrationItem(storage))
-
-    def update_state(self, storage_id: str, new_state: MigrationState) -> None:
-        for item in self.migrations:
-            if item.storage.id == storage_id:
-                item.state = new_state
-                return
-        raise ValueError(f"No migration item with storage id={storage_id}")
+class SnapshotRef:
+    """Per-stream linked list node referencing a global snapshot."""
+    snapshot: Snapshot
+    next: Optional["SnapshotRef"] = None
+
+
+@dataclass
+class MigrationStream:
+    """
+    Each migration stream corresponds to one logical volume.
+    Contains a linked list of snapshot references.
+    Tracks only LV migration state and metadata.
+    """
+    id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    status: StreamState = StreamState.NEW
+
+    # Logical volume info and per-LV migration metadata
+    lvol_name: Optional[str] = None
+    lvol_state: ObjectMigrationState = ObjectMigrationState.NEW
+    lvol_namespace: Optional[str] = None
+    lvol_nqn: Optional[str] = None
+    lvol_source_uuid: Optional[str] = None
+    lvol_target_uuid: Optional[str] = None
+
+    # Linked list of snapshot references (per-stream)
+    head_snapshot_ref: Optional[SnapshotRef] = None
+
+    def append_snapshot(self, snapshot: Snapshot):
+        """Append a snapshot reference to the stream linked list."""
+        ref = SnapshotRef(snapshot=snapshot)
+        if not self.head_snapshot_ref:
+            self.head_snapshot_ref = ref
+            return ref
+        cur = self.head_snapshot_ref
+        while cur.next:
+            cur = cur.next
+        cur.next = ref
+        return ref
+
+    def list_snapshot_names(self) -> List[str]:
+        """Return list of snapshot names in this stream."""
+        names = []
+        cur = self.head_snapshot_ref
+        while cur:
+            names.append(cur.snapshot.name)
+            cur = cur.next
+        return names
+
+
+@dataclass
+class MigrationObject:
+    """
+    Full migration object, containing multiple streams and logical volumes.
+    Snapshots exist independently and are referenced by streams.
+    """
+    id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    status: MigrationState = MigrationState.NEW
+
+    primary_source: Optional[str] = None
+    secondary_source: Optional[str] = None
+    primary_target: Optional[str] = None
+    secondary_target: Optional[str] = None
+
+    logical_volumes: List[LogicalVolumeRef] = field(default_factory=list)
+
+    # Top-level subsystem NQN (if any)
+    nqn: Optional[str] = None
+
+    streams: List[MigrationStream] = field(default_factory=list)
+
+    # Global snapshot objects (shared across streams)
+    snapshots: List[Snapshot] = field(default_factory=list)
+
+    # Async queue for polling migration completion (set externally)
+    completion_poll_queue: Optional[asyncio.Queue] = None
+
+
diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py
index 66ef478f8..75ab0c3d0 100644
--- a/simplyblock_core/rpc_client.py
+++ b/simplyblock_core/rpc_client.py
@@ -1229,3 +1229,57 @@ def nvmf_port_unblock_rdma(self, port):
 
     def nvmf_get_blocked_ports_rdma(self):
         return self._request("nvmf_get_blocked_ports")
+
+    def lvol_final_migration(
+            self,
+            lvol_name: str,
+            lvol_id: str,
+            snapshot_name: str,
+            batch: int,
+            nqn: str
+    ):
+        params = {
+            "lvol_name": lvol_name,
+            "lvol_id": lvol_id,
+            "snapshot_name": snapshot_name,
+            "b": batch,
+            "g": nqn
+        }
+        return self._request("bdev_lvol_final_migration", params)
+
+    def lvol_set_migration_flag(self, lvol_name: str):
+        params = {
+            "lvol_name": lvol_name
+        }
+        return self._request("bdev_lvol_set_migration_flag", params)
+
+    def lvol_convert(self, lvol_name: str):
+        params = {
+            "lvol_name": lvol_name
+        }
+        return self._request("bdev_lvol_convert", params)
+
+    def lvol_add_clone(self, clone_name: str, source_lvol_name: str):
+        params = {
+            "clone_name": clone_name,
+            "source_lvol_name": source_lvol_name
+        }
+        return self._request("bdev_lvol_add_clone", params)
+
+
+def lvol_transfer(
+        self,
+        lvol_name: str,
+        offset: int,
+        batch: int,
+        nqn: str,
+        O: str
+):
+    params = {
+        "n": lvol_name,
+        "o": offset,
+        "b": batch,
+        "g": nqn,
+        "O": O
+    }
+    return self._request("bdev_lvol_transfer", params)
\ No newline at end of file

From 2b144912434668ce3502e5386af75828f997460a Mon Sep 17 00:00:00 2001
From: schmidt-scaled <schmidt@scaled.cloud>
Date: Sat, 15 Nov 2025 12:50:21 +0300
Subject: [PATCH 15/68] set size of lvstore cluster in constants (as ratio to
 distrib page size)

---
 simplyblock_core/constants.py                               | 2 +-
 simplyblock_core/rpc_client.py                              | 4 ++--
 simplyblock_core/services/tasks_runner_failed_migration.py  | 2 +-
 simplyblock_core/services/tasks_runner_migration.py         | 2 +-
 simplyblock_core/services/tasks_runner_new_dev_migration.py | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py
index d82275954..36ba14a9e 100644
--- a/simplyblock_core/constants.py
+++ b/simplyblock_core/constants.py
@@ -225,4 +225,4 @@ def get_config_var(name, default=None):
 
 qos_class_meta_and_migration_weight_percent = 25
 
-MIG_PARALLEL_JOBS = 16
\ No newline at end of file
+MIG_PARALLEL_JOBS = 64
\ No newline at end of file
diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py
index 66ef478f8..ce48e1796 100644
--- a/simplyblock_core/rpc_client.py
+++ b/simplyblock_core/rpc_client.py
@@ -922,7 +922,7 @@ def distr_migration_status(self, name):
         params = {"name": name}
         return self._request("distr_migration_status", params)
 
-    def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=False, job_size=1024, jobs=4):
+    def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=False, job_size=64, jobs=64):
         params = {
             "name": name,
             "storage_ID": storage_ID,
@@ -935,7 +935,7 @@ def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=Fals
             params["jobs"] = jobs
         return self._request("distr_migration_failure_start", params)
 
-    def distr_migration_expansion_start(self, name, qos_high_priority=False, job_size=1024, jobs=4):
+    def distr_migration_expansion_start(self, name, qos_high_priority=False, job_size=64, jobs=64):
         params = {
             "name": name,
         }
diff --git a/simplyblock_core/services/tasks_runner_failed_migration.py b/simplyblock_core/services/tasks_runner_failed_migration.py
index fce4fd8ef..7d0b3e89f 100644
--- a/simplyblock_core/services/tasks_runner_failed_migration.py
+++ b/simplyblock_core/services/tasks_runner_failed_migration.py
@@ -88,7 +88,7 @@ def task_runner(task):
         if db.get_cluster_by_id(snode.cluster_id).is_qos_set():
             qos_high_priority = True
         rsp = rpc_client.distr_migration_failure_start(
-            distr_name, device.cluster_device_order, qos_high_priority, job_size=1024, jobs=constants.MIG_PARALLEL_JOBS)
+            distr_name, device.cluster_device_order, qos_high_priority, job_size=64, jobs=constants.MIG_PARALLEL_JOBS)
         if not rsp:
             logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}")
             task.function_result = "Failed to start device migration task"
diff --git a/simplyblock_core/services/tasks_runner_migration.py b/simplyblock_core/services/tasks_runner_migration.py
index fb085e4aa..e325e3d7e 100644
--- a/simplyblock_core/services/tasks_runner_migration.py
+++ b/simplyblock_core/services/tasks_runner_migration.py
@@ -93,7 +93,7 @@ def task_runner(task):
         qos_high_priority = False
         if db.get_cluster_by_id(snode.cluster_id).is_qos_set():
             qos_high_priority = True
-        rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=1024,
+        rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=64,
                                                          jobs=constants.MIG_PARALLEL_JOBS)
         if not rsp:
             logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}")
diff --git a/simplyblock_core/services/tasks_runner_new_dev_migration.py b/simplyblock_core/services/tasks_runner_new_dev_migration.py
index f62a7f210..9feec7a56 100644
--- a/simplyblock_core/services/tasks_runner_new_dev_migration.py
+++ b/simplyblock_core/services/tasks_runner_new_dev_migration.py
@@ -98,7 +98,7 @@ def task_runner(task):
         qos_high_priority = False
         if db.get_cluster_by_id(snode.cluster_id).is_qos_set():
             qos_high_priority = True
-        rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=1024,
+        rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=64,
                                                          jobs=constants.MIG_PARALLEL_JOBS)
         if not rsp:
             logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}")

From 314c4cfe60cfaf11c3dafb8e856f94bd17940878 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Mon, 17 Nov 2025 11:21:31 +0100
Subject: [PATCH 16/68] Update sc name (#746)

* migrated to k8s csi hostpath

* added more permission

* added more permission

* updated talos docs
---
 docs/talos.md                                 |  14 --
 simplyblock_core/scripts/charts/Chart.yaml    |   5 -
 .../templates/csi-hostpath-controller.yaml    | 217 ++++++++++++++++++
 .../templates/csi-hostpath-driverinfo.yaml    |  24 ++
 .../charts/templates/csi-hostpath-node.yaml   | 163 +++++++++++++
 .../charts/templates/foundationdb.yaml        |   2 +-
 .../scripts/charts/templates/mongodb.yaml     |   4 +-
 .../charts/templates/storage_class.yaml       |   9 +-
 .../scripts/charts/values-template.yaml       | 194 ----------------
 simplyblock_core/scripts/charts/values.yaml   |   7 +-
 10 files changed, 416 insertions(+), 223 deletions(-)
 create mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml
 create mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml
 create mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml
 delete mode 100644 simplyblock_core/scripts/charts/values-template.yaml

diff --git a/docs/talos.md b/docs/talos.md
index 47ff817d5..f1406ef38 100644
--- a/docs/talos.md
+++ b/docs/talos.md
@@ -19,26 +19,12 @@ kubectl label namespace simplyblock \
   --overwrite
 ```
 
-
-Patch the host machine so that OpenEBS could work
-
 Create a machine config patch with the contents below and save as patch.yaml
 ```
 cat > patch.yaml <<'EOF'
 machine:
   sysctls:
     vm.nr_hugepages: "1024"
-  nodeLabels:
-    openebs.io/engine: mayastor
-  kubelet:
-    extraMounts:
-      - destination: /var/openebs/local
-        type: bind
-        source: /var/openebs/local
-        options:
-          - rbind
-          - rshared
-          - rw
 EOF
 
 talosctl -e <endpoint ip/hostname> -n <node ip/hostname> patch mc -p @patch.yaml
diff --git a/simplyblock_core/scripts/charts/Chart.yaml b/simplyblock_core/scripts/charts/Chart.yaml
index 9d1b62643..380f67bcd 100644
--- a/simplyblock_core/scripts/charts/Chart.yaml
+++ b/simplyblock_core/scripts/charts/Chart.yaml
@@ -26,11 +26,6 @@ dependencies:
     version: "25.18.0"
     repository: "https://prometheus-community.github.io/helm-charts"
     condition: monitoring.enabled
-  - name: openebs
-    version: 3.9.0 
-    repository: https://openebs.github.io/charts
-    alias: openebs
-    condition: openebs.enabled
   - name: ingress-nginx
     version: 4.10.1
     repository: "https://kubernetes.github.io/ingress-nginx"
diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml
new file mode 100644
index 000000000..153c29bda
--- /dev/null
+++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml
@@ -0,0 +1,217 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: csi-hostpathplugin-sa
+  namespace: {{ .Release.Namespace }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: csi-hostpathplugin
+rules:
+  - apiGroups: [""]
+    resources: ["persistentvolumes"]
+    verbs: ["get", "list", "watch", "create", "delete", "update", "patch"]
+  - apiGroups: [""]
+    resources: ["persistentvolumeclaims"]
+    verbs: ["get", "list", "watch", "update"]
+  - apiGroups: [""]
+    resources: ["persistentvolumeclaims/status"]
+    verbs: ["get", "update", "patch"]
+  - apiGroups: ["storage.k8s.io"]
+    resources: ["volumeattachments"]
+    verbs: ["get", "list", "watch", "update"]
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["storage.k8s.io"]
+    resources: ["csinodes"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["storage.k8s.io"]
+    resources: ["storageclasses"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["storage.k8s.io"]
+    resources: ["csistoragecapacities"]
+    verbs: ["get", "list", "watch", "create", "update", "delete"]
+  - apiGroups: [""]
+    resources: ["events"]
+    verbs: ["create", "patch", "update", "get", "list", "watch"]
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: csi-hostpathplugin
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: csi-hostpathplugin
+subjects:
+  - kind: ServiceAccount
+    name: csi-hostpathplugin-sa
+    namespace: {{ .Release.Namespace }}
+---
+kind: StatefulSet
+apiVersion: apps/v1
+metadata:
+  name: csi-hostpathplugin
+  labels:
+    app.kubernetes.io/instance: hostpath.csi.k8s.io
+    app.kubernetes.io/part-of: csi-driver-host-path
+    app.kubernetes.io/name: csi-hostpathplugin
+    app.kubernetes.io/component: plugin
+spec:
+  serviceName: "csi-hostpathplugin"
+  # One replica only:
+  # Host path driver only works when everything runs
+  # on a single node.
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/instance: hostpath.csi.k8s.io
+      app.kubernetes.io/part-of: csi-driver-host-path
+      app.kubernetes.io/name: csi-hostpathplugin
+      app.kubernetes.io/component: plugin
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/instance: hostpath.csi.k8s.io
+        app.kubernetes.io/part-of: csi-driver-host-path
+        app.kubernetes.io/name: csi-hostpathplugin
+        app.kubernetes.io/component: plugin
+    spec:
+      serviceAccountName: csi-hostpathplugin-sa
+      containers:
+        - name: hostpath
+          image: registry.k8s.io/sig-storage/hostpathplugin:v1.17.0
+          args:
+            - "--drivername=hostpath.csi.k8s.io"
+            - "--v=5"
+            - "--endpoint=$(CSI_ENDPOINT)"
+            - "--nodeid=$(KUBE_NODE_NAME)"
+            # end hostpath args
+          env:
+            - name: CSI_ENDPOINT
+              value: unix:///csi/csi.sock
+            - name: KUBE_NODE_NAME
+              valueFrom:
+                fieldRef:
+                  apiVersion: v1
+                  fieldPath: spec.nodeName
+          securityContext:
+            privileged: true
+          ports:
+          - containerPort: 9898
+            name: healthz
+            protocol: TCP
+          livenessProbe:
+            failureThreshold: 5
+            httpGet:
+              path: /healthz
+              port: healthz
+            initialDelaySeconds: 10
+            timeoutSeconds: 3
+            periodSeconds: 2
+          volumeMounts:
+            - mountPath: /csi
+              name: socket-dir
+            - mountPath: /var/lib/kubelet/pods
+              mountPropagation: Bidirectional
+              name: mountpoint-dir
+            - mountPath: /var/lib/kubelet/plugins
+              mountPropagation: Bidirectional
+              name: plugins-dir
+            - mountPath: /csi-data-dir
+              name: csi-data-dir
+            - mountPath: /dev
+              name: dev-dir
+
+        - name: liveness-probe
+          volumeMounts:
+          - mountPath: /csi
+            name: socket-dir
+          image: registry.k8s.io/sig-storage/livenessprobe:v2.17.0
+          args:
+          - --csi-address=/csi/csi.sock
+          - --health-port=9898
+
+        - name: csi-provisioner
+          image: registry.k8s.io/sig-storage/csi-provisioner:v6.0.0
+          args:
+            - -v=5
+            - --csi-address=/csi/csi.sock
+            - --feature-gates=Topology=true
+            - --enable-capacity
+            - --capacity-ownerref-level=0 # pod is owner
+            - --node-deployment=true
+            - --strict-topology=true
+            - --immediate-topology=false
+            - --worker-threads=5
+          env:
+          - name: NODE_NAME
+            valueFrom:
+              fieldRef:
+                apiVersion: v1
+                fieldPath: spec.nodeName
+          - name: NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+            # end csi-provisioner args
+          securityContext:
+            # This is necessary only for systems with SELinux, where
+            # non-privileged sidecar containers cannot access unix domain socket
+            # created by privileged CSI driver container.
+            privileged: true
+          volumeMounts:
+            - mountPath: /csi
+              name: socket-dir
+
+        - name: csi-resizer
+          image: registry.k8s.io/sig-storage/csi-resizer:v2.0.0
+          args:
+            - -v=5
+            - -csi-address=/csi/csi.sock
+          securityContext:
+            # This is necessary only for systems with SELinux, where
+            # non-privileged sidecar containers cannot access unix domain socket
+            # created by privileged CSI driver container.
+            privileged: true
+          volumeMounts:
+            - mountPath: /csi
+              name: socket-dir
+
+      volumes:
+        - hostPath:
+            path: /var/lib/kubelet/plugins/csi-hostpath
+            type: DirectoryOrCreate
+          name: socket-dir
+        - hostPath:
+            path: /var/lib/kubelet/pods
+            type: DirectoryOrCreate
+          name: mountpoint-dir
+        - hostPath:
+            path: /var/lib/kubelet/plugins_registry
+            type: Directory
+          name: registration-dir
+        - hostPath:
+            path: /var/lib/kubelet/plugins
+            type: Directory
+          name: plugins-dir
+        - hostPath:
+            # 'path' is where PV data is persisted on host.
+            # using /tmp is also possible while the PVs will not available after plugin container recreation or host reboot
+            path: /var/lib/csi-hostpath-data/
+            type: DirectoryOrCreate
+          name: csi-data-dir
+        - hostPath:
+            path: /dev
+            type: Directory
+          name: dev-dir
+        # end csi volumes
diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml
new file mode 100644
index 000000000..c02431500
--- /dev/null
+++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml
@@ -0,0 +1,24 @@
+apiVersion: storage.k8s.io/v1
+kind: CSIDriver
+metadata:
+  name: hostpath.csi.k8s.io
+  labels:
+    app.kubernetes.io/instance: hostpath.csi.k8s.io
+    app.kubernetes.io/part-of: csi-driver-host-path
+    app.kubernetes.io/name: hostpath.csi.k8s.io
+    app.kubernetes.io/component: csi-driver
+spec:
+  # Supports persistent and ephemeral inline volumes.
+  volumeLifecycleModes:
+  - Persistent
+  - Ephemeral
+  # To determine at runtime which mode a volume uses, pod info and its
+  # "csi.storage.k8s.io/ephemeral" entry are needed.
+  podInfoOnMount: true
+  # No attacher needed.
+  attachRequired: false
+  storageCapacity: true
+  # Kubernetes may use fsGroup to change permissions and ownership 
+  # of the volume to match user requested fsGroup in the pod's SecurityPolicy
+  fsGroupPolicy: File
+  
\ No newline at end of file
diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml
new file mode 100644
index 000000000..07e08f36e
--- /dev/null
+++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml
@@ -0,0 +1,163 @@
+
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: csi-hostpath-node-sa
+  namespace: {{ .Release.Namespace }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: csi-hostpath-node
+rules:
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["storage.k8s.io"]
+    resources: ["volumeattachments"]
+    verbs: ["get", "list", "watch", "update"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: csi-hostpath-node
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: csi-hostpath-node
+subjects:
+  - kind: ServiceAccount
+    name: csi-hostpath-node-sa
+    namespace: {{ .Release.Namespace }}
+---
+kind: DaemonSet
+apiVersion: apps/v1
+metadata:
+  name: csi-hostpathplugin
+  labels:
+    app.kubernetes.io/instance: hostpath.csi.k8s.io
+    app.kubernetes.io/part-of: csi-driver-host-path
+    app.kubernetes.io/name: csi-hostpathplugin
+    app.kubernetes.io/component: plugin
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/instance: hostpath.csi.k8s.io
+      app.kubernetes.io/part-of: csi-driver-host-path
+      app.kubernetes.io/name: csi-hostpathplugin
+      app.kubernetes.io/component: plugin
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/instance: hostpath.csi.k8s.io
+        app.kubernetes.io/part-of: csi-driver-host-path
+        app.kubernetes.io/name: csi-hostpathplugin
+        app.kubernetes.io/component: plugin
+    spec:
+      serviceAccountName: csi-hostpath-node-sa
+      containers:
+        - name: node-driver-registrar
+          image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.12.0
+          args:
+            - --v=5
+            - --csi-address=/csi/csi.sock
+            - --kubelet-registration-path=/var/lib/kubelet/plugins/csi-hostpath/csi.sock
+          securityContext:
+            # This is necessary only for systems with SELinux, where
+            # non-privileged sidecar containers cannot access unix domain socket
+            # created by privileged CSI driver container.
+            privileged: true
+          env:
+            - name: KUBE_NODE_NAME
+              valueFrom:
+                fieldRef:
+                  apiVersion: v1
+                  fieldPath: spec.nodeName
+          volumeMounts:
+          - mountPath: /csi
+            name: socket-dir
+          - mountPath: /registration
+            name: registration-dir
+          - mountPath: /csi-data-dir
+            name: csi-data-dir
+
+        - name: hostpath
+          image: registry.k8s.io/sig-storage/hostpathplugin:v1.15.0
+          args:
+            - --drivername=hostpath.csi.k8s.io
+            - --v=5
+            - --endpoint=$(CSI_ENDPOINT)
+            - --nodeid=$(KUBE_NODE_NAME)
+            - --capacity=slow=10Gi
+            - --capacity=fast=100Gi
+          env:
+            - name: CSI_ENDPOINT
+              value: unix:///csi/csi.sock
+            - name: KUBE_NODE_NAME
+              valueFrom:
+                fieldRef:
+                  apiVersion: v1
+                  fieldPath: spec.nodeName
+          securityContext:
+            privileged: true
+          ports:
+          - containerPort: 9898
+            name: healthz
+            protocol: TCP
+          livenessProbe:
+            failureThreshold: 5
+            httpGet:
+              path: /healthz
+              port: healthz
+            initialDelaySeconds: 10
+            timeoutSeconds: 3
+            periodSeconds: 2
+          volumeMounts:
+            - mountPath: /csi
+              name: socket-dir
+            - mountPath: /var/lib/kubelet/pods
+              mountPropagation: Bidirectional
+              name: mountpoint-dir
+            - mountPath: /var/lib/kubelet/plugins
+              mountPropagation: Bidirectional
+              name: plugins-dir
+            - mountPath: /csi-data-dir
+              name: csi-data-dir
+            - mountPath: /dev
+              name: dev-dir
+        - name: liveness-probe
+          volumeMounts:
+          - mountPath: /csi
+            name: socket-dir
+          image: registry.k8s.io/sig-storage/livenessprobe:v2.15.0
+          args:
+          - --csi-address=/csi/csi.sock
+          - --health-port=9898
+
+      volumes:
+        - hostPath:
+            path: /var/lib/kubelet/plugins/csi-hostpath
+            type: DirectoryOrCreate
+          name: socket-dir
+        - hostPath:
+            path: /var/lib/kubelet/pods
+            type: DirectoryOrCreate
+          name: mountpoint-dir
+        - hostPath:
+            path: /var/lib/kubelet/plugins_registry
+            type: Directory
+          name: registration-dir
+        - hostPath:
+            path: /var/lib/kubelet/plugins
+            type: Directory
+          name: plugins-dir
+        - hostPath:
+            # 'path' is where PV data is persisted on host.
+            # using /tmp is also possible while the PVs will not available after plugin container recreation or host reboot
+            path: /var/lib/csi-hostpath-data/
+            type: DirectoryOrCreate
+          name: csi-data-dir
+        - hostPath:
+            path: /dev
+            type: Directory
+          name: dev-dir
diff --git a/simplyblock_core/scripts/charts/templates/foundationdb.yaml b/simplyblock_core/scripts/charts/templates/foundationdb.yaml
index a3b2d8ccb..4eb7f1410 100644
--- a/simplyblock_core/scripts/charts/templates/foundationdb.yaml
+++ b/simplyblock_core/scripts/charts/templates/foundationdb.yaml
@@ -283,7 +283,7 @@ spec:
               runAsUser: 0
       volumeClaimTemplate:
         spec:
-          storageClassName: openebs-local-hostpath
+          storageClassName: local-hostpath
           accessModes:
             - ReadWriteOnce
           resources:
diff --git a/simplyblock_core/scripts/charts/templates/mongodb.yaml b/simplyblock_core/scripts/charts/templates/mongodb.yaml
index 740dd7642..815df6505 100644
--- a/simplyblock_core/scripts/charts/templates/mongodb.yaml
+++ b/simplyblock_core/scripts/charts/templates/mongodb.yaml
@@ -14,7 +14,7 @@ spec:
             name: data-volume
           spec:
             accessModes: [ "ReadWriteOnce" ]
-            storageClassName: openebs-local-hostpath
+            storageClassName: local-hostpath
             resources:
               requests:
                 storage: 5Gi
@@ -22,7 +22,7 @@ spec:
             name: logs-volume
           spec:
             accessModes: [ "ReadWriteOnce" ]
-            storageClassName: openebs-local-hostpath
+            storageClassName: local-hostpath
             resources:
               requests:
                 storage: 5Gi
diff --git a/simplyblock_core/scripts/charts/templates/storage_class.yaml b/simplyblock_core/scripts/charts/templates/storage_class.yaml
index 9b6a2c9ce..b23cb4a07 100644
--- a/simplyblock_core/scripts/charts/templates/storage_class.yaml
+++ b/simplyblock_core/scripts/charts/templates/storage_class.yaml
@@ -2,8 +2,13 @@
 apiVersion: storage.k8s.io/v1
 kind: StorageClass
 metadata:
-  name: openebs-local-hostpath
-provisioner: openebs.io/local
+  name: local-hostpath
+  labels:
+    app.kubernetes.io/instance: hostpath.csi.k8s.io
+    app.kubernetes.io/part-of: csi-driver-host-path
+    app.kubernetes.io/name: csi-hostpath-fast
+    app.kubernetes.io/component: storageclass
+provisioner: hostpath.csi.k8s.io
 allowVolumeExpansion: true
 reclaimPolicy: Retain
 volumeBindingMode: WaitForFirstConsumer
diff --git a/simplyblock_core/scripts/charts/values-template.yaml b/simplyblock_core/scripts/charts/values-template.yaml
deleted file mode 100644
index 79693e7cd..000000000
--- a/simplyblock_core/scripts/charts/values-template.yaml
+++ /dev/null
@@ -1,194 +0,0 @@
-graylog:
-  rootPasswordSha2: "${GRAYLOG_ROOT_PASSWORD_SHA2}"
-  passwordSecret: "${GRAYLOG_PASSWORD_SECRET}"
-
-cluster:
-  secret: "${CLUSTER_SECRET}"
-  id: "${CLUSTER_ID}"
-  ip: "${CLUSTER_IP}"
-
-monitoring:
-  enabled: ${ENABLE_MONITORING}
-
-log:
-  deletionInterval: "${LOG_DELETION_INTERVAL}"
-  retentionPeriod: "${RETENTION_PERIOD}"
-  level: "${LOG_LEVEL}"
-  maxNumberIndex: "${MAX_NUMBER_OF_INDICES}"
-
-grafana:
-  endpoint: "${GRAFANA_ENDPOINT}"
-  contactPoint: "${CONTACT_POINT}"
-
-image:
-  simplyblock: 
-    repository: "${SIMPLYBLOCK_REPOSITORY}"
-    tag: "${SIMPLYBLOCK_TAG}"
-    pullPolicy: "Always"
-
-openebs:
-  enabled: true
-
-mongodb:
-  name: "simplyblock-mongodb"
-  deployment_name: "simplyblock-mongodb"
-  resources:
-    requests:
-      cpu: 100m
-      memory: 300Mi
-    limits:
-      cpu: 250m
-      memory: 1Gi
-  affinity:
-    podAntiAffinity:
-      requiredDuringSchedulingIgnoredDuringExecution:
-        - labelSelector:
-            matchExpressions:
-              - key: app.kubernetes.io/component
-                operator: In
-                values:
-                  - mongodb
-          topologyKey: "kubernetes.io/hostname"
-
-opensearch:
-  fullnameOverride: "simplyblock-opensearch"
-  singleNode: true
-  replicas: 1
-
-  antiAffinity: "hard"
-  persistence:
-    enabled: true
-    storageClass: openebs-local-hostpath
-    size: 10Gi
-
-  resources:
-    requests:
-      cpu: "100m"
-      memory: "512Mi"
-    limits:
-      cpu: "500m"
-      memory: "3Gi"
-
-  extraEnvs:
-    - name: OPENSEARCH_JAVA_OPTS
-      value: "-Xms1g -Xmx1g"
-    - name: bootstrap.memory_lock
-      value: "true"
-    - name: action.auto_create_index
-      value: "false"
-    - name: plugins.security.ssl.http.enabled
-      value: "false"
-    - name: plugins.security.disabled
-      value: "true"
-
-  securityConfig:
-    enabled: false
-
-prometheus:
-  server:
-    fullnameOverride: simplyblock-prometheus
-    enabled: true
-    statefulSet:
-      enabled: true
-    name: simplyblock-prometheus
-    replicaCount: 1
-    podLabels:
-      app: simplyblock-prometheus
-    podAnnotations: {}
-    affinity:
-      podAntiAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-                - key: app.kubernetes.io/component
-                  operator: In
-                  values:
-                    - simplyblock-prometheus
-            topologyKey: "kubernetes.io/hostname"
-    service:
-      servicePort: 9090
-      type: ClusterIP
-      gRPC:
-        enabled: true
-        servicePort: 10901
-      additionalPorts:
-        - name: http-thanos
-          port: 10902
-          targetPort: 10902
-          protocol: TCP
-    securityContext:
-      fsGroup: 65534
-    persistentVolume:
-      enabled: true
-      size: 5Gi
-      storageClass: openebs-local-hostpath
-    extraArgs:
-      storage.tsdb.min-block-duration: 2h
-      storage.tsdb.max-block-duration: 2h
-    sidecarContainers:
-      thanos-sidecar:
-        image: thanosio/thanos:v0.31.0
-        args:
-          - sidecar
-          - --tsdb.path=/prometheus
-          - --prometheus.url=http://localhost:9090
-          - --objstore.config-file=/etc/thanos/objstore.yml
-        ports:
-          - name: grpc
-            containerPort: 10901
-          - name: http
-            containerPort: 10902
-        volumeMounts:
-          - name: storage-volume
-            mountPath: /prometheus
-          - name: objstore-config
-            mountPath: /etc/thanos
-        resources:
-          requests:
-            cpu: "100m"
-            memory: "256Mi"
-          limits:
-            cpu: "250m"
-            memory: "1Gi"
-    resources:
-      requests:
-        cpu: "100m"
-        memory: "512Mi"
-      limits:
-        cpu: "500m"
-        memory: "1Gi"
-    configMapOverrideName: simplyblock-prometheus-config
-    extraVolumes:
-      - name: objstore-config
-        configMap:
-          name: simplyblock-objstore-config
-  alertmanager:
-    enabled: false
-
-  prometheus-pushgateway:
-    enabled: false
-
-  prometheus-node-exporter:
-    enabled: false
-
-  kube-state-metrics:
-    enabled: false
-
-ingress:
-  enabled: true
-  ingressClassName: nginx
-  useDNS: ${USE_DNS}
-  host: "${DNS_NAME}"
-  tlsSecret: ${TLS_SECRET}
-  controller:
-    hostNetwork: ${USE_HOST}  
-    dnsPolicy: ClusterFirstWithHostNet
-    service:
-      type: ${SERVICE_TYPE}
-      nodePorts:
-        tcp:
-          4501: 32451 
-    extraArgs:
-      tcp-services-configmap: "${K8S_NAMESPACE}/simplyblock-tcp-services"
-    nodeSelector:
-      simplyblock.io/role: mgmt-plane
diff --git a/simplyblock_core/scripts/charts/values.yaml b/simplyblock_core/scripts/charts/values.yaml
index 994e9a21f..0b70f321e 100644
--- a/simplyblock_core/scripts/charts/values.yaml
+++ b/simplyblock_core/scripts/charts/values.yaml
@@ -28,9 +28,6 @@ ports:
 storageclass:  
   allowedTopologyZones: []
 
-openebs:
-  enabled: true
-
 foundationdb:
   multiAZ: false
 
@@ -63,7 +60,7 @@ opensearch:
   antiAffinity: "hard"
   persistence:
     enabled: true
-    storageClass: openebs-local-hostpath
+    storageClass: local-hostpath
     size: 10Gi
 
   resources:
@@ -129,7 +126,7 @@ prometheus:
     persistentVolume:
       enabled: true
       size: 5Gi
-      storageClass: openebs-local-hostpath
+      storageClass: local-hostpath
     extraArgs:
       storage.tsdb.min-block-duration: 2h
       storage.tsdb.max-block-duration: 2h

From ce6ae0f17ff1acc55a6078cc79f9762a53db124d Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Mon, 17 Nov 2025 14:58:26 +0100
Subject: [PATCH 17/68] updated to distributed provisioning (#748)

* updated to distributed provisioning

* remove host storage capacity check
---
 .../templates/csi-hostpath-driverinfo.yaml    |   2 +-
 .../charts/templates/csi-hostpath-node.yaml   | 163 ------------------
 ...ntroller.yaml => csi-hostpath-plugin.yaml} | 144 +++++++++-------
 3 files changed, 81 insertions(+), 228 deletions(-)
 delete mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml
 rename simplyblock_core/scripts/charts/templates/{csi-hostpath-controller.yaml => csi-hostpath-plugin.yaml} (83%)

diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml
index c02431500..2a9d7d044 100644
--- a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml
+++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml
@@ -17,7 +17,7 @@ spec:
   podInfoOnMount: true
   # No attacher needed.
   attachRequired: false
-  storageCapacity: true
+  storageCapacity: false
   # Kubernetes may use fsGroup to change permissions and ownership 
   # of the volume to match user requested fsGroup in the pod's SecurityPolicy
   fsGroupPolicy: File
diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml
deleted file mode 100644
index 07e08f36e..000000000
--- a/simplyblock_core/scripts/charts/templates/csi-hostpath-node.yaml
+++ /dev/null
@@ -1,163 +0,0 @@
-
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: csi-hostpath-node-sa
-  namespace: {{ .Release.Namespace }}
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: csi-hostpath-node
-rules:
-  - apiGroups: [""]
-    resources: ["nodes"]
-    verbs: ["get", "list", "watch"]
-  - apiGroups: ["storage.k8s.io"]
-    resources: ["volumeattachments"]
-    verbs: ["get", "list", "watch", "update"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: csi-hostpath-node
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: csi-hostpath-node
-subjects:
-  - kind: ServiceAccount
-    name: csi-hostpath-node-sa
-    namespace: {{ .Release.Namespace }}
----
-kind: DaemonSet
-apiVersion: apps/v1
-metadata:
-  name: csi-hostpathplugin
-  labels:
-    app.kubernetes.io/instance: hostpath.csi.k8s.io
-    app.kubernetes.io/part-of: csi-driver-host-path
-    app.kubernetes.io/name: csi-hostpathplugin
-    app.kubernetes.io/component: plugin
-spec:
-  selector:
-    matchLabels:
-      app.kubernetes.io/instance: hostpath.csi.k8s.io
-      app.kubernetes.io/part-of: csi-driver-host-path
-      app.kubernetes.io/name: csi-hostpathplugin
-      app.kubernetes.io/component: plugin
-  template:
-    metadata:
-      labels:
-        app.kubernetes.io/instance: hostpath.csi.k8s.io
-        app.kubernetes.io/part-of: csi-driver-host-path
-        app.kubernetes.io/name: csi-hostpathplugin
-        app.kubernetes.io/component: plugin
-    spec:
-      serviceAccountName: csi-hostpath-node-sa
-      containers:
-        - name: node-driver-registrar
-          image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.12.0
-          args:
-            - --v=5
-            - --csi-address=/csi/csi.sock
-            - --kubelet-registration-path=/var/lib/kubelet/plugins/csi-hostpath/csi.sock
-          securityContext:
-            # This is necessary only for systems with SELinux, where
-            # non-privileged sidecar containers cannot access unix domain socket
-            # created by privileged CSI driver container.
-            privileged: true
-          env:
-            - name: KUBE_NODE_NAME
-              valueFrom:
-                fieldRef:
-                  apiVersion: v1
-                  fieldPath: spec.nodeName
-          volumeMounts:
-          - mountPath: /csi
-            name: socket-dir
-          - mountPath: /registration
-            name: registration-dir
-          - mountPath: /csi-data-dir
-            name: csi-data-dir
-
-        - name: hostpath
-          image: registry.k8s.io/sig-storage/hostpathplugin:v1.15.0
-          args:
-            - --drivername=hostpath.csi.k8s.io
-            - --v=5
-            - --endpoint=$(CSI_ENDPOINT)
-            - --nodeid=$(KUBE_NODE_NAME)
-            - --capacity=slow=10Gi
-            - --capacity=fast=100Gi
-          env:
-            - name: CSI_ENDPOINT
-              value: unix:///csi/csi.sock
-            - name: KUBE_NODE_NAME
-              valueFrom:
-                fieldRef:
-                  apiVersion: v1
-                  fieldPath: spec.nodeName
-          securityContext:
-            privileged: true
-          ports:
-          - containerPort: 9898
-            name: healthz
-            protocol: TCP
-          livenessProbe:
-            failureThreshold: 5
-            httpGet:
-              path: /healthz
-              port: healthz
-            initialDelaySeconds: 10
-            timeoutSeconds: 3
-            periodSeconds: 2
-          volumeMounts:
-            - mountPath: /csi
-              name: socket-dir
-            - mountPath: /var/lib/kubelet/pods
-              mountPropagation: Bidirectional
-              name: mountpoint-dir
-            - mountPath: /var/lib/kubelet/plugins
-              mountPropagation: Bidirectional
-              name: plugins-dir
-            - mountPath: /csi-data-dir
-              name: csi-data-dir
-            - mountPath: /dev
-              name: dev-dir
-        - name: liveness-probe
-          volumeMounts:
-          - mountPath: /csi
-            name: socket-dir
-          image: registry.k8s.io/sig-storage/livenessprobe:v2.15.0
-          args:
-          - --csi-address=/csi/csi.sock
-          - --health-port=9898
-
-      volumes:
-        - hostPath:
-            path: /var/lib/kubelet/plugins/csi-hostpath
-            type: DirectoryOrCreate
-          name: socket-dir
-        - hostPath:
-            path: /var/lib/kubelet/pods
-            type: DirectoryOrCreate
-          name: mountpoint-dir
-        - hostPath:
-            path: /var/lib/kubelet/plugins_registry
-            type: Directory
-          name: registration-dir
-        - hostPath:
-            path: /var/lib/kubelet/plugins
-            type: Directory
-          name: plugins-dir
-        - hostPath:
-            # 'path' is where PV data is persisted on host.
-            # using /tmp is also possible while the PVs will not available after plugin container recreation or host reboot
-            path: /var/lib/csi-hostpath-data/
-            type: DirectoryOrCreate
-          name: csi-data-dir
-        - hostPath:
-            path: /dev
-            type: Directory
-          name: dev-dir
diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml
similarity index 83%
rename from simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml
rename to simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml
index 153c29bda..8e695e593 100644
--- a/simplyblock_core/scripts/charts/templates/csi-hostpath-controller.yaml
+++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml
@@ -52,8 +52,9 @@ subjects:
   - kind: ServiceAccount
     name: csi-hostpathplugin-sa
     namespace: {{ .Release.Namespace }}
+
 ---
-kind: StatefulSet
+kind: DaemonSet
 apiVersion: apps/v1
 metadata:
   name: csi-hostpathplugin
@@ -63,11 +64,6 @@ metadata:
     app.kubernetes.io/name: csi-hostpathplugin
     app.kubernetes.io/component: plugin
 spec:
-  serviceName: "csi-hostpathplugin"
-  # One replica only:
-  # Host path driver only works when everything runs
-  # on a single node.
-  replicas: 1
   selector:
     matchLabels:
       app.kubernetes.io/instance: hostpath.csi.k8s.io
@@ -84,67 +80,12 @@ spec:
     spec:
       serviceAccountName: csi-hostpathplugin-sa
       containers:
-        - name: hostpath
-          image: registry.k8s.io/sig-storage/hostpathplugin:v1.17.0
-          args:
-            - "--drivername=hostpath.csi.k8s.io"
-            - "--v=5"
-            - "--endpoint=$(CSI_ENDPOINT)"
-            - "--nodeid=$(KUBE_NODE_NAME)"
-            # end hostpath args
-          env:
-            - name: CSI_ENDPOINT
-              value: unix:///csi/csi.sock
-            - name: KUBE_NODE_NAME
-              valueFrom:
-                fieldRef:
-                  apiVersion: v1
-                  fieldPath: spec.nodeName
-          securityContext:
-            privileged: true
-          ports:
-          - containerPort: 9898
-            name: healthz
-            protocol: TCP
-          livenessProbe:
-            failureThreshold: 5
-            httpGet:
-              path: /healthz
-              port: healthz
-            initialDelaySeconds: 10
-            timeoutSeconds: 3
-            periodSeconds: 2
-          volumeMounts:
-            - mountPath: /csi
-              name: socket-dir
-            - mountPath: /var/lib/kubelet/pods
-              mountPropagation: Bidirectional
-              name: mountpoint-dir
-            - mountPath: /var/lib/kubelet/plugins
-              mountPropagation: Bidirectional
-              name: plugins-dir
-            - mountPath: /csi-data-dir
-              name: csi-data-dir
-            - mountPath: /dev
-              name: dev-dir
-
-        - name: liveness-probe
-          volumeMounts:
-          - mountPath: /csi
-            name: socket-dir
-          image: registry.k8s.io/sig-storage/livenessprobe:v2.17.0
-          args:
-          - --csi-address=/csi/csi.sock
-          - --health-port=9898
-
         - name: csi-provisioner
           image: registry.k8s.io/sig-storage/csi-provisioner:v6.0.0
           args:
             - -v=5
             - --csi-address=/csi/csi.sock
             - --feature-gates=Topology=true
-            - --enable-capacity
-            - --capacity-ownerref-level=0 # pod is owner
             - --node-deployment=true
             - --strict-topology=true
             - --immediate-topology=false
@@ -163,7 +104,6 @@ spec:
             valueFrom:
               fieldRef:
                 fieldPath: metadata.name
-            # end csi-provisioner args
           securityContext:
             # This is necessary only for systems with SELinux, where
             # non-privileged sidecar containers cannot access unix domain socket
@@ -172,7 +112,6 @@ spec:
           volumeMounts:
             - mountPath: /csi
               name: socket-dir
-
         - name: csi-resizer
           image: registry.k8s.io/sig-storage/csi-resizer:v2.0.0
           args:
@@ -187,6 +126,84 @@ spec:
             - mountPath: /csi
               name: socket-dir
 
+        - name: node-driver-registrar
+          image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.12.0
+          args:
+            - --v=5
+            - --csi-address=/csi/csi.sock
+            - --kubelet-registration-path=/var/lib/kubelet/plugins/csi-hostpath/csi.sock
+          securityContext:
+            # This is necessary only for systems with SELinux, where
+            # non-privileged sidecar containers cannot access unix domain socket
+            # created by privileged CSI driver container.
+            privileged: true
+          env:
+            - name: KUBE_NODE_NAME
+              valueFrom:
+                fieldRef:
+                  apiVersion: v1
+                  fieldPath: spec.nodeName
+          volumeMounts:
+          - mountPath: /csi
+            name: socket-dir
+          - mountPath: /registration
+            name: registration-dir
+          - mountPath: /csi-data-dir
+            name: csi-data-dir
+
+        - name: hostpath
+          image: registry.k8s.io/sig-storage/hostpathplugin:v1.15.0
+          args:
+            - --drivername=hostpath.csi.k8s.io
+            - --v=5
+            - --endpoint=$(CSI_ENDPOINT)
+            - --nodeid=$(KUBE_NODE_NAME)
+            - --capacity=slow=10Gi
+            - --capacity=fast=100Gi
+          env:
+            - name: CSI_ENDPOINT
+              value: unix:///csi/csi.sock
+            - name: KUBE_NODE_NAME
+              valueFrom:
+                fieldRef:
+                  apiVersion: v1
+                  fieldPath: spec.nodeName
+          securityContext:
+            privileged: true
+          ports:
+          - containerPort: 9898
+            name: healthz
+            protocol: TCP
+          livenessProbe:
+            failureThreshold: 5
+            httpGet:
+              path: /healthz
+              port: healthz
+            initialDelaySeconds: 10
+            timeoutSeconds: 3
+            periodSeconds: 2
+          volumeMounts:
+            - mountPath: /csi
+              name: socket-dir
+            - mountPath: /var/lib/kubelet/pods
+              mountPropagation: Bidirectional
+              name: mountpoint-dir
+            - mountPath: /var/lib/kubelet/plugins
+              mountPropagation: Bidirectional
+              name: plugins-dir
+            - mountPath: /csi-data-dir
+              name: csi-data-dir
+            - mountPath: /dev
+              name: dev-dir
+        - name: liveness-probe
+          volumeMounts:
+          - mountPath: /csi
+            name: socket-dir
+          image: registry.k8s.io/sig-storage/livenessprobe:v2.15.0
+          args:
+          - --csi-address=/csi/csi.sock
+          - --health-port=9898
+
       volumes:
         - hostPath:
             path: /var/lib/kubelet/plugins/csi-hostpath
@@ -214,4 +231,3 @@ spec:
             path: /dev
             type: Directory
           name: dev-dir
-        # end csi volumes

From 5596c1179092f036d28911a4f449896b5ae8c1be Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Mon, 17 Nov 2025 16:41:17 +0100
Subject: [PATCH 18/68] Update Dockerfile_base (#750)

---
 docker/Dockerfile_base | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docker/Dockerfile_base b/docker/Dockerfile_base
index 201d92759..735d331b1 100644
--- a/docker/Dockerfile_base
+++ b/docker/Dockerfile_base
@@ -39,4 +39,3 @@ COPY requirements.txt requirements.txt
 
 RUN pip3 install -r requirements.txt
 
-RUN rm -rf /usr/share/terminfo/

From aaa9b420e01d8a1b80f7a79aee45095e69af0af5 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Tue, 18 Nov 2025 12:52:08 +0100
Subject: [PATCH 19/68] sleep after openshift core isolation until reboot
 (#753)

* sleep after openshift core isolation until reboot

* increaased timeout to 3mins

* check and remove old job if found

* check and remove old job if found
---
 .../api/internal/storage_node/kubernetes.py   | 28 ++++++++++++++++++-
 simplyblock_web/node_utils_k8s.py             | 19 ++++++++++++-
 .../oc_storage_core_isolation.yaml.j2         | 15 +++++++++-
 3 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py
index 56b4ca563..b6ab71b63 100644
--- a/simplyblock_web/api/internal/storage_node/kubernetes.py
+++ b/simplyblock_web/api/internal/storage_node/kubernetes.py
@@ -423,9 +423,35 @@ def spdk_process_start(body: SPDKParams):
             logger.info(f"Job deleted: '{core_resp.metadata.name}' in namespace '{namespace}")
 
         elif core_isolate and openshift:
+            batch_v1 = core_utils.get_k8s_batch_client()
+            try:
+                batch_v1.read_namespaced_job(
+                    name=node_prepration_core_name,
+                    namespace=namespace
+                )
+                logger.info(f"Existing Job '{node_prepration_core_name}' found — deleting it first...")
+
+                batch_v1.delete_namespaced_job(
+                    name=node_prepration_core_name,
+                    namespace=namespace,
+                    body=V1DeleteOptions(
+                        propagation_policy='Foreground',
+                        grace_period_seconds=0
+                    )
+                )
+
+                node_utils_k8s.wait_for_job_deletion(node_prepration_core_name, namespace)
+
+                logger.info(f"Old Job '{node_prepration_core_name}' fully deleted.")
+
+            except ApiException as e:
+                if e.status == 404:
+                    logger.info(f"No pre-existing Job '{node_prepration_core_name}' found. Proceeding.")
+                else:
+                    raise
+                
             core_template = env.get_template('oc_storage_core_isolation.yaml.j2')
             core_yaml = yaml.safe_load(core_template.render(values))
-            batch_v1 = core_utils.get_k8s_batch_client()
             core_resp = batch_v1.create_namespaced_job(namespace=namespace, body=core_yaml)
             msg = f"Job created: '{core_resp.metadata.name}' in namespace '{namespace}"
             logger.info(msg)
diff --git a/simplyblock_web/node_utils_k8s.py b/simplyblock_web/node_utils_k8s.py
index 4626a89c9..b1440744d 100644
--- a/simplyblock_web/node_utils_k8s.py
+++ b/simplyblock_web/node_utils_k8s.py
@@ -5,6 +5,7 @@
 import time
 
 from simplyblock_core.utils import get_k8s_batch_client
+from kubernetes.client import ApiException
 
 
 node_name = os.environ.get("HOSTNAME")
@@ -23,7 +24,7 @@ def get_namespace():
             return out
     return default_namespace
 
-def wait_for_job_completion(job_name, namespace, timeout=60):
+def wait_for_job_completion(job_name, namespace, timeout=180):
     batch_v1 = get_k8s_batch_client()
     for _ in range(timeout):
         job = batch_v1.read_namespaced_job(job_name, namespace)
@@ -33,3 +34,19 @@ def wait_for_job_completion(job_name, namespace, timeout=60):
             raise RuntimeError(f"Job '{job_name}' failed")
         time.sleep(3)
     raise TimeoutError(f"Timeout waiting for Job '{job_name}' to complete")
+
+def wait_for_job_deletion(job_name, namespace, timeout=60):
+    batch_v1 = get_k8s_batch_client()
+
+    for _ in range(timeout):
+        try:
+            batch_v1.read_namespaced_job(job_name, namespace)
+        except ApiException as e:
+            if e.status == 404:
+                return True
+            else:
+                raise
+
+        time.sleep(2)
+
+    raise TimeoutError(f"Timeout waiting for Job '{job_name}' to be deleted")
diff --git a/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 b/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2
index 734d9c59e..74f66721d 100644
--- a/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2
+++ b/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2
@@ -34,9 +34,18 @@ spec:
             - |
               set -e
 
+              MARKER="/var/simplyblock/.cpu_isolation_applied"
+
               echo "--- Installing jq ---"
               apk add --no-cache jq
 
+              echo "--- Checking if node was already configured ---"
+
+              if [[ -f "$MARKER" ]]; then
+                  echo "[INFO] Node already configured. Skipping sleep and exiting..."
+                  exit 0
+              fi
+
               echo "--- Reading isolated cores from config ---"
               CONFIG_FILE="/var/simplyblock/sn_config_file"
 
@@ -105,4 +114,8 @@ spec:
 
               echo "[INFO] Init setup and CPU isolation complete."
               
-              echo "--- Init setup complete ---"
+              echo "[INFO] Marking node as configured."
+              touch "$MARKER"
+
+              echo "[INFO] Node is rebooting. Sleeping indefinitely to stop pipeline..."
+              sleep infinity

From dbb1bd4b0e8205c930d8ce17f20f0efa1d128e78 Mon Sep 17 00:00:00 2001
From: schmidt-scaled <schmidt@scaled.cloud>
Date: Wed, 19 Nov 2025 09:43:40 +0300
Subject: [PATCH 20/68] set size of lvstore cluster in constants (as ratio to
 distrib page size)

---
 .../controllers/lvol_migration_controller.py  | 358 ++++++++++++------
 simplyblock_core/models/lvol_migration.py     |  55 ++-
 simplyblock_core/models/lvol_model.py         |   1 +
 simplyblock_core/models/snapshot.py           |   1 +
 4 files changed, 273 insertions(+), 142 deletions(-)

diff --git a/simplyblock_core/controllers/lvol_migration_controller.py b/simplyblock_core/controllers/lvol_migration_controller.py
index 9a4067943..51aea303d 100644
--- a/simplyblock_core/controllers/lvol_migration_controller.py
+++ b/simplyblock_core/controllers/lvol_migration_controller.py
@@ -1,122 +1,252 @@
 import asyncio
-from typing import Optional
+import logging
+from operator import truediv
+from os import MFD_ALLOW_SEALING
+
+from e2e.utils.get_lba_diff_report import fetch_files
+from ..models.lvol_migration import *
+from dataclasses import dataclass
+from typing import List, Optional
+from simplyblock_core.rpc_client import RPCClient
+from simplyblock_core.storage_node_ops import *
+from simplyblock_core.db_controller import *
+from simplyblock_core.models.lvol_model import LVol
+from simplyblock_core.models.storage_node import StorageNode
+from simplyblock_core.models.lvol_migration import Snapshot
+from simplyblock_core.models.snapshot import SnapShot
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
 
-from ..models.lvol_migration import (
-    MigrationObject, MigrationStream, Snapshot,
-    MigrationState, StreamState, ObjectMigrationState
-)
 
 # ---------------------------------------------------------------------------
-# CONTROLLER
+# Migration Service
+# ---------------------------------------------------------------------------
+
+class MigrationQueueObjectType:
+    SNAPSHOT = "snapshot"
+    CLONE = "clone"
+    LVOL = "lvol"
+
+
+@dataclass
+class MigrationQueueObject:
+    obj: object
+    type: str
+    status: ObjectMigrationState = ObjectMigrationState.NEW
+    retries: int = 0
+    last_offset: Optional[int] = None  # For snapshot continuation
+
+
+class MigrationQueue:
+    """Queue holding migration objects for a single LVOL."""
+
+    def __init__(self):
+        self.objects: List[MigrationQueueObject] = []
+
+    def add(self, obj, obj_type, status=ObjectMigrationState.NEW):
+        mqo = MigrationQueueObject(obj=obj, type=obj_type, status=status)
+        self.objects.append(mqo)
+        return mqo
+
+    def reset(self):
+        self.objects.clear()
+
+
+class MigrationService:
+    """Service containing core migration logic."""
+
+    MAX_RETRIES = 3
+    RETRY_DELAY = 5  # seconds, can be increased exponentially
+
+    async def migrate_object(self, mqo: MigrationQueueObject, target_node: str, secondary_node: str):
+        """Perform actual migration of snapshot/clone/lvol."""
+        try:
+            mqo.status = ObjectMigrationState.RUNNING
+            logger.info(f"Starting migration of {mqo.type} {getattr(mqo.obj, 'name', None)}")
+
+            # Simulate RPC / async migration
+            await asyncio.sleep(0.1)  # replace with actual migration RPC
+
+            # Example: if snapshot, migrate from source to target subsystem
+            # handle last_offset, retries, errors here
+
+            mqo.status = ObjectMigrationState.DONE
+            logger.info(f"Completed migration of {mqo.type} {getattr(mqo.obj, 'name', None)}")
+        except Exception as e:
+            logger.error(f"Error migrating {mqo.type}: {e}")
+            mqo.status = ObjectMigrationState.SUSPENDED
+            if mqo.retries < self.MAX_RETRIES:
+                mqo.retries += 1
+                await asyncio.sleep(self.RETRY_DELAY * mqo.retries)
+                await self.migrate_object(mqo, target_node, secondary_node)
+            else:
+                mqo.status = ObjectMigrationState.FAILED
+
+    async def process_migration_queue(self, mq: MigrationQueue, all_nodes_online: callable):
+        """Process the migration queue (snapshots -> clones -> LVOL)."""
+        # Step 1: Snapshots
+        for mqo in mq.objects:
+            if mqo.type == MigrationQueueObjectType.SNAPSHOT and mqo.status in [ObjectMigrationState.NEW,
+                                                                                ObjectMigrationState.RUNNING]:
+                await self.migrate_object(mqo, target_node="primary", secondary_node="secondary")
+
+        if any(mqo.status != ObjectMigrationState.DONE for mqo in mq.objects if
+               mqo.type == MigrationQueueObjectType.SNAPSHOT) or not all_nodes_online():
+            return ObjectMigrationState.CANCELED
+
+        # Step 2: Clones
+        for mqo in mq.objects:
+            if mqo.type == MigrationQueueObjectType.CLONE and mqo.status in [ObjectMigrationState.NEW,
+                                                                             ObjectMigrationState.RUNNING]:
+                await self.migrate_object(mqo, target_node="primary", secondary_node="secondary")
+
+        if any(mqo.status != ObjectMigrationState.DONE for mqo in mq.objects if
+               mqo.type == MigrationQueueObjectType.CLONE) or not all_nodes_online():
+            return ObjectMigrationState.CANCELED
+
+        # Step 3: LVOL
+        for mqo in mq.objects:
+            if mqo.type == MigrationQueueObjectType.LVOL and mqo.status in [ObjectMigrationState.NEW,
+                                                                            ObjectMigrationState.RUNNING]:
+                await self.migrate_object(mqo, target_node="primary", secondary_node="secondary")
+
+        if any(mqo.status != ObjectMigrationState.DONE for mqo in mq.objects if
+               mqo.type == MigrationQueueObjectType.LVOL) or not all_nodes_online():
+            return ObjectMigrationState.CANCELED
+
+        return ObjectMigrationState.DONE
+
+    async def cleanup_migration_queue(self, mq: MigrationQueue):
+        """Remove all partially migrated objects from target."""
+        for mqo in mq.objects:
+            if mqo.status != ObjectMigrationState.NEW:
+                logger.info(f"Cleaning up {mqo.type} {getattr(mqo.obj, 'name', None)} on target")
+                await asyncio.sleep(0.05)  # simulate async delete RPC
+                mqo.status = ObjectMigrationState.CANCELED
+
+        mq.reset()
+
+
+# ---------------------------------------------------------------------------
+# Migration Controller
 # ---------------------------------------------------------------------------
 
 class MigrationController:
+    """Controller orchestrates LVOL migrations."""
+
+    m: MigrationObject
+
+    def assign_lvol(lvol:LVol):
+        m = MigrationObject()
+        m.main_logical_volume.name = lvol.name
+        m.main_logical_volume.state = ObjectMigrationState.NEW
+        m.main_logical_volume.nqn = lvol.nqn
+        m.main_logical_volume.uuid = lvol.uuid
+        m.main_logical_volume.node_id = lvol.hostname
+        if lvol.crypto_bdev != "":
+           m.main_logical_volume.crypto_bdev_name = lvol.crypto_bdev
+        m.main_logical_volume.mapid = 0
+        m.main_logical_volume.namespace_id = lvol.namespace
+        m.main_logical_volume.cloned = lvol.cloned_from_snap
+        return m
+
+    def assign_snap(lvol:LVol, snap: SnapShot):
+        s = Snapshot()
+        s.status = ObjectMigrationState.NEW
+        s.name = snap.name
+        s.source_uuid = snap.snap_uuid
+        return s
+
+    def create_tmp_nqn(self):
+        #create subsystem
+        #create listener
+        #create namespace
+        return
+
+    def delete_tmp_nqn(self):
+        return
+
+    def create_target_object(self, is_lvol: bool):
+
+        return
+
+    def connect_client(node):
+        return RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=3, retry=1)
+
+    def check_nodes_online(n1: StorageNode, n2: StorageNode, n3: StorageNode, n4: StorageNode):
+        if (n1.status == StorageNode.STATUS_ONLINE and
+                n2.status == StorageNode.STATUS_ONLINE and
+                n2.status == StorageNode.STATUS_ONLINE and
+                n3.status == StorageNode.STATUS_ONLINE):
+            return True
+        return False
+
+    def migrate_stream(s: MigrationStream):
+        return
+
+    def cleanup_stream(s: MigrationStream):
+        return
+
+    def cleanup_migration(s: MigrationStream):
+        return
+
+    def migrate_streams(self):
+            for s in self.m.streams:
+                if s.status == StreamState.NEW or s.status == StreamState.RUNNING:
+                    self.migrate_stream(s)
+                if s.status == StreamState.CLEANUP:
+                    self.cleanup_stream(s)
+            self.cleanup_migration(True)
+            partially=False
+            final=MigrationState.DONE
+            for s in self.m.streams:
+                if s.status == StreamState.DONE:
+                    partially=True
+                if s.status == StreamState.FAILED:
+                    final=MigrationState.PARTIALLY_FAILED
+            if not partially:
+                final = MigrationState.FAILED
+            return final
+
+    def check_status_migration(self):
+        return
+
+    def migrate_lvol(self, lvol, target_node: str):
+        """Migrate a logical volume and its snapshots/clones."""
+        db_controller = DBController()
+        lvol.frozen=True
+        lvol.write_to_db(db_controller.kv_store)
+        self.m = self.assign_lvol(lvol)
+        self.m.node_pri = StorageNode(db_controller.get_storage_node_by_id(self.m.main_logical_volume.node_id))
+        self.m.node_sec = self.m.node_pri.secondary_node_id
+        self.m.target_node_pri = StorageNode(db_controller.get_storage_node_by_id(target_node))
+        self.m.target_node_sec = self.m.target_node_pri.secondary_node_id
+        if self.check_nodes_online(self.m.node_pri, self.m.node_sec,self.m.target_node_pri, self.m.target_node_sec):
+            self.rpc_client1 = self.connect_client(self.m.node_pri)
+            self.rpc_client2 = self.connect_client(self.m.target_node_pri)
+            lvols=db_controller.get_lvols_by_node_id(self.m.main_logical_volume.node_id)
+            snapshots=db_controller.get_snapshots()
+            self.m.snapshots = []
+            for s in snapshots:
+                if s.lvol.uuid==self.m.main_logical_volume.uuid:
+                    self.m.snapshots.append(self.assign_snap(s.lvol,s))
+                    s.frozen = True
+                    s.write_to_db(db_controller.kv_store)
+            for l in lvols:
+                if
+
+            #get all snapshots of lvol
+            #get all clones
+            #freeze service
+
+            #now run
+            #fill snapshots
+            #fill lvols
+            #create all streams
+
+            self.migrate_streams()
+
+
+
 
-    def __init__(self, migration: MigrationObject):
-        self.migration = migration
-        if self.migration.completion_poll_queue is None:
-            self.migration.completion_poll_queue = asyncio.Queue()
-
-    # -----------------------------------------------------------------------
-    # START MIGRATION
-    # -----------------------------------------------------------------------
-    async def migrate_start(self):
-        """Entry point: prepare snapshots and streams, start migration."""
-        self.migration.status = MigrationState.PREPARING
-
-        # 1. Check all nodes online (mocked)
-        if not self._nodes_online():
-            self.migration.status = MigrationState.SUSPENDED
-            return
-
-        # 2. Build streams for all logical volumes
-        await self.migrate_prepare()
-
-        self.migration.status = MigrationState.RUNNING
-        await self.migration_iterate_streams()
-
-    async def migrate_prepare(self):
-        """Prepare each logical volume: build streams and snapshot references."""
-        for lv in self.migration.logical_volumes:
-            stream = MigrationStream(
-                lvol_name=lv.name,
-                lvol_state=ObjectMigrationState.NEW,
-                lvol_namespace=lv.namespace_uuid
-            )
-            # Link snapshots if any exist for this LV
-            for snapshot in self.migration.snapshots:
-                if snapshot.name.startswith(lv.name):  # simple match; customize
-                    stream.append_snapshot(snapshot)
-            self.migration.streams.append(stream)
-
-    # -----------------------------------------------------------------------
-    # ITERATE STREAMS
-    # -----------------------------------------------------------------------
-    async def migration_iterate_streams(self):
-        """Iterate over all streams sequentially."""
-        for stream in self.migration.streams:
-            if stream.status not in {StreamState.DONE, StreamState.FAILED}:
-                await self.migrate_stream_start(stream)
-
-        # If all streams done, mark migration done
-        if all(s.status == StreamState.DONE for s in self.migration.streams):
-            self.migration.status = MigrationState.DONE
-
-    # -----------------------------------------------------------------------
-    # STREAM OPERATIONS
-    # -----------------------------------------------------------------------
-    async def migrate_stream_start(self, stream: MigrationStream):
-        """Start migration for a stream."""
-        stream.status = StreamState.RUNNING
-
-        # Iterate snapshots in the stream
-        current = stream.head_snapshot_ref
-        while current:
-            snapshot = current.snapshot
-            if snapshot.status == ObjectMigrationState.NEW:
-                await spdk_set_migration_flag(snapshot.name)
-                await spdk_transfer_snapshot(snapshot.name, stream.lvol_name)
-                snapshot.status = ObjectMigrationState.DONE
-                # Add to completion poll queue
-                await self.migration.completion_poll_queue.put(snapshot.name)
-            current = current.next
-
-        # Once snapshots done, migrate the main LV
-        await self.migrate_stream_resume(stream)
-
-    async def migrate_stream_resume(self, stream: MigrationStream):
-        """Handle LV migration after snapshots."""
-        if stream.lvol_state == ObjectMigrationState.NEW:
-            await spdk_final_lvol_migration(stream.lvol_name)
-            stream.lvol_state = ObjectMigrationState.DONE
-            stream.status = StreamState.DONE
-
-        # Clean up intermediate resources
-        await self.migrate_stream_cleanup(stream)
-
-    async def migrate_stream_cleanup(self, stream: MigrationStream):
-        """Cleanup temporary namespaces, NQNs, etc."""
-        # Placeholder: remove temp subsystems or namespaces
-        await asyncio.sleep(0.01)
-        # No additional state changes needed for this skeleton
-
-    # -----------------------------------------------------------------------
-    # MIGRATION CLEANUP
-    # -----------------------------------------------------------------------
-    async def migrate_cleanup(self, failed: bool = False):
-        """Global migration cleanup."""
-        if failed:
-            self.migration.status = MigrationState.FAILED
-            # Mark streams failed if not done
-            for stream in self.migration.streams:
-                if stream.status != StreamState.DONE:
-                    stream.status = StreamState.FAILED
-        else:
-            self.migration.status = MigrationState.DONE
-
-    # -----------------------------------------------------------------------
-    # HELPER FUNCTIONS
-    # -----------------------------------------------------------------------
-    def _nodes_online(self) -> bool:
-        """Mock node health check."""
-        return True
diff --git a/simplyblock_core/models/lvol_migration.py b/simplyblock_core/models/lvol_migration.py
index 5d9a51c5e..537e226e5 100644
--- a/simplyblock_core/models/lvol_migration.py
+++ b/simplyblock_core/models/lvol_migration.py
@@ -4,6 +4,7 @@
 from typing import List, Optional
 import uuid
 import asyncio
+import storage_node
 
 
 # ---------------------------------------------------------------------------
@@ -33,7 +34,7 @@ class ObjectMigrationState(str, Enum):
     NEW = "new"
     RUNNING = "running"
     SUSPENDED = "suspended"
-    CANCELED = "canceled"
+    CANCELED = "failed"
     DONE = "done"
 
 
@@ -45,7 +46,17 @@ class ObjectMigrationState(str, Enum):
 class LogicalVolumeRef:
     """Reference to a logical volume participating in a migration."""
     name: str  # "LVS/LV"
-    namespace_uuid: str
+    uuid: str
+    namespace_id: str
+    nqn : str
+    node_id: str
+    sec_node_id :str
+    target_node_id : str
+    target_sec_node_id : str
+    mapid: str
+    target_uuid: str
+    cloned : str
+    state : ObjectMigrationState
     crypto_bdev_name: Optional[str] = None
 
 
@@ -80,19 +91,11 @@ class MigrationStream:
     Contains a linked list of snapshot references.
     Tracks only LV migration state and metadata.
     """
-    id: str = field(default_factory=lambda: str(uuid.uuid4()))
-    status: StreamState = StreamState.NEW
-
-    # Logical volume info and per-LV migration metadata
-    lvol_name: Optional[str] = None
-    lvol_state: ObjectMigrationState = ObjectMigrationState.NEW
-    lvol_namespace: Optional[str] = None
-    lvol_nqn: Optional[str] = None
-    lvol_source_uuid: Optional[str] = None
-    lvol_target_uuid: Optional[str] = None
-
+    volume : LogicalVolumeRef
     # Linked list of snapshot references (per-stream)
     head_snapshot_ref: Optional[SnapshotRef] = None
+    id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    status: StreamState = StreamState.NEW
 
     def append_snapshot(self, snapshot: Snapshot):
         """Append a snapshot reference to the stream linked list."""
@@ -122,25 +125,21 @@ class MigrationObject:
     Full migration object, containing multiple streams and logical volumes.
     Snapshots exist independently and are referenced by streams.
     """
-    id: str = field(default_factory=lambda: str(uuid.uuid4()))
-    status: MigrationState = MigrationState.NEW
 
-    primary_source: Optional[str] = None
-    secondary_source: Optional[str] = None
-    primary_target: Optional[str] = None
-    secondary_target: Optional[str] = None
-
-    logical_volumes: List[LogicalVolumeRef] = field(default_factory=list)
-
-    # Top-level subsystem NQN (if any)
-    nqn: Optional[str] = None
-
-    streams: List[MigrationStream] = field(default_factory=list)
+    main_logical_volume : LogicalVolumeRef
+    node_pri : storage_node.StorageNode
+    node_sec: storage_node.StorageNode
+    target_node_pri: storage_node.StorageNode
+    target_node_sec: storage_node.StorageNode
 
+    clones: List[LogicalVolumeRef]
+    streams: List[MigrationStream]
     # Global snapshot objects (shared across streams)
-    snapshots: List[Snapshot] = field(default_factory=list)
-
+    snapshots: List[Snapshot]
     # Async queue for polling migration completion (set externally)
     completion_poll_queue: Optional[asyncio.Queue] = None
 
+    id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    status: MigrationState = MigrationState.NEW
+
 
diff --git a/simplyblock_core/models/lvol_model.py b/simplyblock_core/models/lvol_model.py
index f84091473..fd0ca3356 100644
--- a/simplyblock_core/models/lvol_model.py
+++ b/simplyblock_core/models/lvol_model.py
@@ -66,6 +66,7 @@ class LVol(BaseModel):
     fabric: str = "tcp"
     ndcs: int = 0
     npcs: int = 0
+    frozen: bool = False
 
     def has_qos(self):
         return (self.rw_ios_per_sec > 0 or self.rw_mbytes_per_sec > 0 or self.r_mbytes_per_sec > 0 or self.w_mbytes_per_sec > 0)
diff --git a/simplyblock_core/models/snapshot.py b/simplyblock_core/models/snapshot.py
index 1da571ec8..65448038a 100644
--- a/simplyblock_core/models/snapshot.py
+++ b/simplyblock_core/models/snapshot.py
@@ -29,3 +29,4 @@ class SnapShot(BaseModel):
     deletion_status: str = ""
     status: str = ""
     fabric: str = "tcp"
+    frozen: bool = False

From b60925d23cc7c159cb72b774d689060f84e2c648 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Wed, 19 Nov 2025 12:09:16 +0100
Subject: [PATCH 21/68] added try and except to patch_prometheus_configmap func
 (#756)

---
 simplyblock_core/utils/__init__.py | 50 ++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 10 deletions(-)

diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py
index 0892db54a..96a00ecac 100644
--- a/simplyblock_core/utils/__init__.py
+++ b/simplyblock_core/utils/__init__.py
@@ -2037,17 +2037,47 @@ def patch_prometheus_configmap(username: str, password: str):
     load_kube_config_with_fallback()
     v1 = client.CoreV1Api()
 
-    cm = v1.read_namespaced_config_map(name="sbcli-simplyblock-prometheus-config", namespace=constants.K8S_NAMESPACE)
-    prometheus_yml = cm.data.get("prometheus.yml", "")
+    try:
+        cm = v1.read_namespaced_config_map(
+            name="sbcli-simplyblock-prometheus-config",
+            namespace=constants.K8S_NAMESPACE
+        )
+    except client.exceptions.ApiException as e:
+        logger.error(f"Failed to read ConfigMap: {e}")
+        return False
+
+    try:
+        prometheus_yml = cm.data.get("prometheus.yml", "")
+        if not prometheus_yml:
+            logger.error("prometheus.yml key not found in ConfigMap.")
+            return False
 
-    prometheus_yml = re.sub(r"username:*", f"username: '{username}'", prometheus_yml)
-    prometheus_yml = re.sub(r"password:*", f"password: '{password}'", prometheus_yml)
+        try:
+            prometheus_yml = re.sub(r"username:.*", f"username: '{username}'", prometheus_yml)
+            prometheus_yml = re.sub(r"password:.*", f"password: '{password}'", prometheus_yml)
+        except re.error as e:
+            logger.error(f"Regex error while patching Prometheus YAML: {e}")
+            return False
 
-    patch_body = {
-        "data": {
-            "prometheus.yml": prometheus_yml
+        patch_body = {
+            "data": {
+                "prometheus.yml": prometheus_yml
+            }
         }
-    }
 
-    v1.patch_namespaced_config_map(name="sbcli-simplyblock-prometheus-config", namespace=constants.K8S_NAMESPACE, body=patch_body)
-    logger.info("Patched sbcli-simplyblock-prometheus-config ConfigMap with new credentials.")
+        v1.patch_namespaced_config_map(
+            name="sbcli-simplyblock-prometheus-config",
+            namespace=constants.K8S_NAMESPACE,
+            body=patch_body
+        )
+
+        logger.info("Patched sbcli-simplyblock-prometheus-config ConfigMap with new credentials.")
+        return True
+
+    except client.exceptions.ApiException as e:
+        logger.error(f"Failed to patch ConfigMap: {e}")
+        return False
+
+    except Exception as e:
+        logger.error(f"Unexpected error while patching ConfigMap: {e}")
+        return False

From f9b2c08ae5dec0b194ad93cbec9829d1a8e50965 Mon Sep 17 00:00:00 2001
From: schmidt-scaled <schmidt@scaled.cloud>
Date: Wed, 19 Nov 2025 21:04:20 +0300
Subject: [PATCH 22/68] set size of lvstore cluster in constants (as ratio to
 distrib page size)

---
 .../controllers/lvol_controller.py            |   4 +
 .../controllers/lvol_migration_controller.py  | 184 +++++++-----------
 .../controllers/snapshot_controller.py        |  33 +++-
 simplyblock_core/models/lvol_migration.py     |  59 ++----
 4 files changed, 116 insertions(+), 164 deletions(-)

diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py
index 4d7a5aad3..2bee37cd9 100644
--- a/simplyblock_core/controllers/lvol_controller.py
+++ b/simplyblock_core/controllers/lvol_controller.py
@@ -865,6 +865,10 @@ def delete_lvol_from_node(lvol_id, node_id, clear_data=True, del_async=False):
     except KeyError:
         return True
 
+    if lvol.frozen:
+        logger.warning(f"lvol in migration. cannot delete lvol {lvol.uuid}")
+        return False
+
     logger.info(f"Deleting LVol:{lvol.get_id()} from node:{snode.get_id()}")
     rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password, timeout=5, retry=2)
 
diff --git a/simplyblock_core/controllers/lvol_migration_controller.py b/simplyblock_core/controllers/lvol_migration_controller.py
index 51aea303d..c4c39bc2e 100644
--- a/simplyblock_core/controllers/lvol_migration_controller.py
+++ b/simplyblock_core/controllers/lvol_migration_controller.py
@@ -59,73 +59,6 @@ class MigrationService:
     MAX_RETRIES = 3
     RETRY_DELAY = 5  # seconds, can be increased exponentially
 
-    async def migrate_object(self, mqo: MigrationQueueObject, target_node: str, secondary_node: str):
-        """Perform actual migration of snapshot/clone/lvol."""
-        try:
-            mqo.status = ObjectMigrationState.RUNNING
-            logger.info(f"Starting migration of {mqo.type} {getattr(mqo.obj, 'name', None)}")
-
-            # Simulate RPC / async migration
-            await asyncio.sleep(0.1)  # replace with actual migration RPC
-
-            # Example: if snapshot, migrate from source to target subsystem
-            # handle last_offset, retries, errors here
-
-            mqo.status = ObjectMigrationState.DONE
-            logger.info(f"Completed migration of {mqo.type} {getattr(mqo.obj, 'name', None)}")
-        except Exception as e:
-            logger.error(f"Error migrating {mqo.type}: {e}")
-            mqo.status = ObjectMigrationState.SUSPENDED
-            if mqo.retries < self.MAX_RETRIES:
-                mqo.retries += 1
-                await asyncio.sleep(self.RETRY_DELAY * mqo.retries)
-                await self.migrate_object(mqo, target_node, secondary_node)
-            else:
-                mqo.status = ObjectMigrationState.FAILED
-
-    async def process_migration_queue(self, mq: MigrationQueue, all_nodes_online: callable):
-        """Process the migration queue (snapshots -> clones -> LVOL)."""
-        # Step 1: Snapshots
-        for mqo in mq.objects:
-            if mqo.type == MigrationQueueObjectType.SNAPSHOT and mqo.status in [ObjectMigrationState.NEW,
-                                                                                ObjectMigrationState.RUNNING]:
-                await self.migrate_object(mqo, target_node="primary", secondary_node="secondary")
-
-        if any(mqo.status != ObjectMigrationState.DONE for mqo in mq.objects if
-               mqo.type == MigrationQueueObjectType.SNAPSHOT) or not all_nodes_online():
-            return ObjectMigrationState.CANCELED
-
-        # Step 2: Clones
-        for mqo in mq.objects:
-            if mqo.type == MigrationQueueObjectType.CLONE and mqo.status in [ObjectMigrationState.NEW,
-                                                                             ObjectMigrationState.RUNNING]:
-                await self.migrate_object(mqo, target_node="primary", secondary_node="secondary")
-
-        if any(mqo.status != ObjectMigrationState.DONE for mqo in mq.objects if
-               mqo.type == MigrationQueueObjectType.CLONE) or not all_nodes_online():
-            return ObjectMigrationState.CANCELED
-
-        # Step 3: LVOL
-        for mqo in mq.objects:
-            if mqo.type == MigrationQueueObjectType.LVOL and mqo.status in [ObjectMigrationState.NEW,
-                                                                            ObjectMigrationState.RUNNING]:
-                await self.migrate_object(mqo, target_node="primary", secondary_node="secondary")
-
-        if any(mqo.status != ObjectMigrationState.DONE for mqo in mq.objects if
-               mqo.type == MigrationQueueObjectType.LVOL) or not all_nodes_online():
-            return ObjectMigrationState.CANCELED
-
-        return ObjectMigrationState.DONE
-
-    async def cleanup_migration_queue(self, mq: MigrationQueue):
-        """Remove all partially migrated objects from target."""
-        for mqo in mq.objects:
-            if mqo.status != ObjectMigrationState.NEW:
-                logger.info(f"Cleaning up {mqo.type} {getattr(mqo.obj, 'name', None)} on target")
-                await asyncio.sleep(0.05)  # simulate async delete RPC
-                mqo.status = ObjectMigrationState.CANCELED
-
-        mq.reset()
 
 
 # ---------------------------------------------------------------------------
@@ -137,12 +70,18 @@ class MigrationController:
 
     m: MigrationObject
 
-    def assign_lvol(lvol:LVol):
+    def assign_lvol(lvol:LVol, target_lvs: str):
         m = MigrationObject()
-        m.main_logical_volume.name = lvol.name
         m.main_logical_volume.state = ObjectMigrationState.NEW
-        m.main_logical_volume.nqn = lvol.nqn
+
+        #unique identifier:
         m.main_logical_volume.uuid = lvol.uuid
+
+        m.main_logical_volume.bdev_name = lvol.lvol_bdev
+        m.main_logical_volume.lvs_name = lvol.lvs_name
+        m.main_logical_volume.target_lvs_name = target_lvs
+        m.main_logical_volume.nqn = lvol.nqn
+        m.main_logical_volume.source_uuid = lvol.lvol_uuid
         m.main_logical_volume.node_id = lvol.hostname
         if lvol.crypto_bdev != "":
            m.main_logical_volume.crypto_bdev_name = lvol.crypto_bdev
@@ -151,10 +90,14 @@ def assign_lvol(lvol:LVol):
         m.main_logical_volume.cloned = lvol.cloned_from_snap
         return m
 
-    def assign_snap(lvol:LVol, snap: SnapShot):
+    def assign_snap(lvol: LogicalVolumeRef, snap: SnapShot, target_lvs: str):
         s = Snapshot()
         s.status = ObjectMigrationState.NEW
-        s.name = snap.name
+        s.bdev_name = snap.snap_bdev.split("/", 1)[1]
+        s.lvs_name = lvol.lvs_name
+        s.target_lvs_name = lvol.target_lvs_name
+        s.target_lvs_name = target_lvs
+        s.uuid = snap.uuid
         s.source_uuid = snap.snap_uuid
         return s
 
@@ -165,10 +108,15 @@ def create_tmp_nqn(self):
         return
 
     def delete_tmp_nqn(self):
+
         return
 
     def create_target_object(self, is_lvol: bool):
+        self.rpc_client2.create_lvol(self, name, size_in_mib, lvs_name, lvol_priority_class=0, ndcs=0, npcs=0):
+        ef
+        create_lvol(self, name, size_in_mib, lvs_name, lvol_priority_class=0, ndcs=0, npcs=0):
 
+        self.rpc.client2.lvol_set_migration_flag()
         return
 
     def connect_client(node):
@@ -182,71 +130,75 @@ def check_nodes_online(n1: StorageNode, n2: StorageNode, n3: StorageNode, n4: St
             return True
         return False
 
-    def migrate_stream(s: MigrationStream):
-        return
+    def migrate_snaps(self):
+        if self.m.status==MigrationState.RUNNING:
+            for s in self.m.snapshots:
+                if s.snapshot.status==ObjectMigrationState.NEW:
+                   s.snapshot.target_uuid=self.create_target_lvol(s.snapshot.name)
+                   s.snapshot.temporary_nqn,s.snapshot.temporary_namespace=self.create_tmp_nqn(s.snapshot.target_uuid)
+
+                elif s.snapshot.status==ObjectMigrationState.SUSPENDED:
+
+
+
 
-    def cleanup_stream(s: MigrationStream):
         return
 
-    def cleanup_migration(s: MigrationStream):
+    def migrate_lvol(self):
         return
 
-    def migrate_streams(self):
-            for s in self.m.streams:
-                if s.status == StreamState.NEW or s.status == StreamState.RUNNING:
-                    self.migrate_stream(s)
-                if s.status == StreamState.CLEANUP:
-                    self.cleanup_stream(s)
-            self.cleanup_migration(True)
-            partially=False
-            final=MigrationState.DONE
-            for s in self.m.streams:
-                if s.status == StreamState.DONE:
-                    partially=True
-                if s.status == StreamState.FAILED:
-                    final=MigrationState.PARTIALLY_FAILED
-            if not partially:
-                final = MigrationState.FAILED
-            return final
+    def cleanup_migration(status: bool):
+        return
 
     def check_status_migration(self):
         return
 
     def migrate_lvol(self, lvol, target_node: str):
         """Migrate a logical volume and its snapshots/clones."""
+        self.m.status = MigrationState.NEW
+
+        #update lvol: frozen means it cannot be deleted or resized. new snapshots cannot be taken.
         db_controller = DBController()
         lvol.frozen=True
         lvol.write_to_db(db_controller.kv_store)
-        self.m = self.assign_lvol(lvol)
+
+        #get all 4 storage node objects: primary, secondary source and target
         self.m.node_pri = StorageNode(db_controller.get_storage_node_by_id(self.m.main_logical_volume.node_id))
         self.m.node_sec = self.m.node_pri.secondary_node_id
         self.m.target_node_pri = StorageNode(db_controller.get_storage_node_by_id(target_node))
         self.m.target_node_sec = self.m.target_node_pri.secondary_node_id
-        if self.check_nodes_online(self.m.node_pri, self.m.node_sec,self.m.target_node_pri, self.m.target_node_sec):
-            self.rpc_client1 = self.connect_client(self.m.node_pri)
-            self.rpc_client2 = self.connect_client(self.m.target_node_pri)
-            lvols=db_controller.get_lvols_by_node_id(self.m.main_logical_volume.node_id)
-            snapshots=db_controller.get_snapshots()
-            self.m.snapshots = []
-            for s in snapshots:
-                if s.lvol.uuid==self.m.main_logical_volume.uuid:
-                    self.m.snapshots.append(self.assign_snap(s.lvol,s))
-                    s.frozen = True
-                    s.write_to_db(db_controller.kv_store)
-            for l in lvols:
-                if
 
-            #get all snapshots of lvol
-            #get all clones
-            #freeze service
+        #copy now all data from the lvol to the migration lvol (temporary object for lvol during migration)
+        self.m = self.assign_lvol(lvol, self.m.target_node_pri.lvstore)
 
-            #now run
-            #fill snapshots
-            #fill lvols
-            #create all streams
-
-            self.migrate_streams()
+        #create rpc clients for both primaries:
+        self.rpc_client1 = self.connect_client(self.m.node_pri)
+        self.rpc_client2 = self.connect_client(self.m.target_node_pri)
 
+        #now we create a chain of snapshots from all snapshots taken from this lvol
+        snapshots=db_controller.get_snapshots()
+        snapshots.sort(key=lambda s: s.created_at)
+        self.m.snapshots = []
+        sr=None
+        for s in snapshots:
+                if s.lvol.uuid==self.m.main_logical_volume.uuid:
+                     s.frozen=True
+                     #need to reset that one on node restart
+                     s.write_to_db(db_controller.kv_store)
+                     srn=Snapshot()
+                     if sr:
+                       sr.next=srn
+                     sr=srn
+                     sr.snapshot=self.assign_snap(s.lvol,s)
+                     self.m.snapshots.append(sr)
+
+
+        if self.check_nodes_online(self.m.node_pri, self.m.node_sec, self.m.target_node_pri, self.m.target_node_sec):
+            self.m.status = MigrationState.RUNNING
+            self.migrate_snaps()
+        else:
+            logger.warning(f"Not all nodes online. Suspending lvol life migration {lvol.uuid}")
+            self.m.status=MigrationState.SUSPENDED
 
 
 
diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py
index d3eca0e00..97a5c3ef0 100644
--- a/simplyblock_core/controllers/snapshot_controller.py
+++ b/simplyblock_core/controllers/snapshot_controller.py
@@ -13,6 +13,18 @@
 from simplyblock_core.models.storage_node import StorageNode
 from simplyblock_core.rpc_client import RPCClient
 
+import threading
+from collections import defaultdict
+
+# A dictionary to hold locks per node
+node_locks = defaultdict(threading.Lock)
+node_locks_global_lock = threading.Lock()  # protects the node_locks dict
+
+def get_node_lock(node_id):
+    # Ensure thread-safe creation of locks
+    with node_locks_global_lock:
+        return node_locks[node_id]
+
 
 logger = lg.getLogger()
 
@@ -20,12 +32,21 @@
 
 
 def add(lvol_id, snapshot_name):
-    try:
+
+
+  try:
         lvol = db_controller.get_lvol_by_id(lvol_id)
-    except KeyError as e:
+  except KeyError as e:
         logger.error(e)
         return False, str(e)
 
+  if lvol.frozen:
+        logger.warning(f"Lvol in migration: Cannot create snapshot from lvol {lvol.uuid} ")
+        return False
+
+  node_lock = get_node_lock(lvol.node_id)
+  with node_lock:
+
     pool = db_controller.get_pool_by_id(lvol.pool_uuid)
     if pool.status == Pool.STATUS_INACTIVE:
         msg = "Pool is disabled"
@@ -250,6 +271,10 @@ def delete(snapshot_uuid, force_delete=False):
         logger.error(f"Snapshot not found {snapshot_uuid}")
         return False
 
+    if snap.frozen:
+        logger.warning(f"lvol in migration. cannot delete snapshot {snap.uuid}")
+        return False
+
     try:
         snode = db_controller.get_storage_node_by_id(snap.lvol.node_id)
     except KeyError:
@@ -362,6 +387,10 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None
         logger.error(e)
         return False, str(e)
 
+    if snap.frozen:
+        logger.warning(f"lvol in migration. cannot create clone {snap.uuid}")
+        return False
+
     try:
         pool = db_controller.get_pool_by_id(snap.lvol.pool_uuid)
     except KeyError:
diff --git a/simplyblock_core/models/lvol_migration.py b/simplyblock_core/models/lvol_migration.py
index 537e226e5..fd204890a 100644
--- a/simplyblock_core/models/lvol_migration.py
+++ b/simplyblock_core/models/lvol_migration.py
@@ -45,8 +45,13 @@ class ObjectMigrationState(str, Enum):
 @dataclass
 class LogicalVolumeRef:
     """Reference to a logical volume participating in a migration."""
-    name: str  # "LVS/LV"
+
     uuid: str
+    bdev_name: str  # "LVS/LV"
+    lvs_name: str
+    target_lvs_name: str
+    source_uuid: str
+    target_uuid: str
     namespace_id: str
     nqn : str
     node_id: str
@@ -54,21 +59,22 @@ class LogicalVolumeRef:
     target_node_id : str
     target_sec_node_id : str
     mapid: str
-    target_uuid: str
     cloned : str
     state : ObjectMigrationState
     crypto_bdev_name: Optional[str] = None
 
-
 @dataclass
 class Snapshot:
     """
     Global snapshot object, exists only once.
     Stores all per-snapshot migration metadata.
     """
-    name: str  # "LVS/LV"
-    source_uuid: Optional[str] = None
-    target_uuid: Optional[str] = None
+    uuid : str
+    bdev_name: str  # "LVS/LV"
+    lvs_name: str
+    target_lvs_name : str
+    source_uuid : str
+    target_uuid : Optional[str] = None
 
     # Migration metadata
     temporary_nqn: Optional[str] = None
@@ -76,49 +82,12 @@ class Snapshot:
     mapid: Optional[str] = None
     status: ObjectMigrationState = ObjectMigrationState.NEW
 
-
 @dataclass
 class SnapshotRef:
     """Per-stream linked list node referencing a global snapshot."""
     snapshot: Snapshot
     next: Optional["SnapshotRef"] = None
 
-
-@dataclass
-class MigrationStream:
-    """
-    Each migration stream corresponds to one logical volume.
-    Contains a linked list of snapshot references.
-    Tracks only LV migration state and metadata.
-    """
-    volume : LogicalVolumeRef
-    # Linked list of snapshot references (per-stream)
-    head_snapshot_ref: Optional[SnapshotRef] = None
-    id: str = field(default_factory=lambda: str(uuid.uuid4()))
-    status: StreamState = StreamState.NEW
-
-    def append_snapshot(self, snapshot: Snapshot):
-        """Append a snapshot reference to the stream linked list."""
-        ref = SnapshotRef(snapshot=snapshot)
-        if not self.head_snapshot_ref:
-            self.head_snapshot_ref = ref
-            return ref
-        cur = self.head_snapshot_ref
-        while cur.next:
-            cur = cur.next
-        cur.next = ref
-        return ref
-
-    def list_snapshot_names(self) -> List[str]:
-        """Return list of snapshot names in this stream."""
-        names = []
-        cur = self.head_snapshot_ref
-        while cur:
-            names.append(cur.snapshot.name)
-            cur = cur.next
-        return names
-
-
 @dataclass
 class MigrationObject:
     """
@@ -132,10 +101,8 @@ class MigrationObject:
     target_node_pri: storage_node.StorageNode
     target_node_sec: storage_node.StorageNode
 
-    clones: List[LogicalVolumeRef]
-    streams: List[MigrationStream]
     # Global snapshot objects (shared across streams)
-    snapshots: List[Snapshot]
+    snapshots: List[SnapshotRef]
     # Async queue for polling migration completion (set externally)
     completion_poll_queue: Optional[asyncio.Queue] = None
 

From bb90c602bb3852fa6186f54dfdb340acc7dda2d4 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Sun, 23 Nov 2025 09:36:54 +0100
Subject: [PATCH 23/68] added hostNetwork true to simplyblock controlplane
 services (#771)

---
 .../scripts/charts/templates/app_k8s.yaml     | 67 ++++++++++++++-----
 1 file changed, 52 insertions(+), 15 deletions(-)

diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml b/simplyblock_core/scripts/charts/templates/app_k8s.yaml
index ec2e5b378..d17ea092a 100644
--- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml
+++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml
@@ -19,6 +19,8 @@ spec:
         app: simplyblock-admin-control
     spec:
       serviceAccountName: simplyblock-control-sa
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet 
       containers:
       - name: simplyblock-control
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -133,6 +135,8 @@ spec:
       labels:
         app: simplyblock-storage-node-monitor
     spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet 
       containers:
       - name: storage-node-monitor
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -183,6 +187,8 @@ spec:
       labels:
         app: simplyblock-mgmt-node-monitor
     spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet 
       containers:
         - name: mgmt-node-monitor
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -235,6 +241,8 @@ spec:
       labels:
         app: simplyblock-lvol-stats-collector
     spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet 
       containers:
         - name: lvol-stats-collector
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -284,7 +292,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-main-distr-event-collector
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: main-distr-event-collector
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -334,7 +344,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-capacity-and-stats-collector
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
       - name: capacity-and-stats-collector
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -385,7 +397,8 @@ spec:
       labels:
         app: simplyblock-capacity-monitor
     spec:
-      
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
       - name: capacity-monitor
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -435,7 +448,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-health-check
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
       - name: health-check
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -485,7 +500,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-device-monitor
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
       - name: device-monitor
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -535,7 +552,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-lvol-monitor
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
       - name: lvol-monitor
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -584,7 +603,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-snapshot-monitor
-    spec:     
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet      
       containers:
       - name: snapshot-monitor
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -685,7 +706,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-restart
-    spec:     
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet      
       containers:
         - name: tasks-runner-restart
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -735,7 +758,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-migration
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-runner-migration
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -784,7 +809,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-failed-migration
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-runner-failed-migration
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -833,7 +860,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-cluster-status
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-runner-cluster-status
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -882,7 +911,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-new-device-migration
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-runner-new-device-migration
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -931,7 +962,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-node-add-runner
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-node-addrunner
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -983,7 +1016,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-port-allow
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-runner-port-allow
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -1032,7 +1067,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-jc-comp-resume
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-runner-jc-comp-resume
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"

From 855513f4302b0e5c023a8ea4a97d76b217f19919 Mon Sep 17 00:00:00 2001
From: schmidt-scaled <schmidt@scaled.cloud>
Date: Tue, 25 Nov 2025 00:31:39 +0300
Subject: [PATCH 24/68] set size of lvstore cluster in constants (as ratio to
 distrib page size)

---
 .../controllers/lvol_migration_controller.py  | 407 ++++++++++++++----
 .../controllers/snapshot_controller.py        |   2 +
 simplyblock_core/db_controller.py             |  14 +-
 simplyblock_core/models/lvol_migration.py     | 127 +++---
 simplyblock_core/models/snapshot.py           |   1 +
 simplyblock_core/storage_node_ops.py          |  12 +
 6 files changed, 412 insertions(+), 151 deletions(-)

diff --git a/simplyblock_core/controllers/lvol_migration_controller.py b/simplyblock_core/controllers/lvol_migration_controller.py
index c4c39bc2e..88da79191 100644
--- a/simplyblock_core/controllers/lvol_migration_controller.py
+++ b/simplyblock_core/controllers/lvol_migration_controller.py
@@ -1,22 +1,23 @@
-import asyncio
 import logging
-from operator import truediv
-from os import MFD_ALLOW_SEALING
+from logging import exception
+from time import sleep
 
-from e2e.utils.get_lba_diff_report import fetch_files
+from ..cluster_ops import db_controller
 from ..models.lvol_migration import *
 from dataclasses import dataclass
-from typing import List, Optional
-from simplyblock_core.rpc_client import RPCClient
+from typing import Optional
 from simplyblock_core.storage_node_ops import *
 from simplyblock_core.db_controller import *
 from simplyblock_core.models.lvol_model import LVol
 from simplyblock_core.models.storage_node import StorageNode
 from simplyblock_core.models.lvol_migration import Snapshot
 from simplyblock_core.models.snapshot import SnapShot
+from datetime import datetime
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
+import uuid
+
 
 
 # ---------------------------------------------------------------------------
@@ -65,18 +66,25 @@ class MigrationService:
 # Migration Controller
 # ---------------------------------------------------------------------------
 
+
 class MigrationController:
     """Controller orchestrates LVOL migrations."""
 
-    m: MigrationObject
 
-    def assign_lvol(lvol:LVol, target_lvs: str):
-        m = MigrationObject()
+    def __init__(self):
+        self._stop_event = threading.Event()
+        self.lock = threading.Lock()
+        self.m: MigrationObject = MigrationObject()
+        self.db_controller = DBController()
+        self.prev_time = datetime.now()
+
+    def lvol_assign(self, lvol:LVol, target_lvs: str):
+        m=MigrationObject()
         m.main_logical_volume.state = ObjectMigrationState.NEW
 
         #unique identifier:
+        m.main_logical_volume.retry=0
         m.main_logical_volume.uuid = lvol.uuid
-
         m.main_logical_volume.bdev_name = lvol.lvol_bdev
         m.main_logical_volume.lvs_name = lvol.lvs_name
         m.main_logical_volume.target_lvs_name = target_lvs
@@ -86,119 +94,354 @@ def assign_lvol(lvol:LVol, target_lvs: str):
         if lvol.crypto_bdev != "":
            m.main_logical_volume.crypto_bdev_name = lvol.crypto_bdev
         m.main_logical_volume.mapid = 0
+        m.main_logical_volume.size = lvol.size
+        m.main_logical_volume.ndcs = lvol.ndcs
+        m.main_logical_volume.npcs = lvol.npcs
+        m.main_logical_volume.priority_class = lvol.lvol_priority_class
         m.main_logical_volume.namespace_id = lvol.namespace
         m.main_logical_volume.cloned = lvol.cloned_from_snap
-        return m
+        return m.main_logical_volume
 
-    def assign_snap(lvol: LogicalVolumeRef, snap: SnapShot, target_lvs: str):
+    def snap_assign(lvol: LogicalVolumeRef, snap: SnapShot, target_lvs: str):
         s = Snapshot()
+        s.retry = 0
         s.status = ObjectMigrationState.NEW
         s.bdev_name = snap.snap_bdev.split("/", 1)[1]
         s.lvs_name = lvol.lvs_name
+        s.lvol_size = snap.size
         s.target_lvs_name = lvol.target_lvs_name
         s.target_lvs_name = target_lvs
         s.uuid = snap.uuid
         s.source_uuid = snap.snap_uuid
         return s
 
-    def create_tmp_nqn(self):
-        #create subsystem
-        #create listener
-        #create namespace
-        return
-
-    def delete_tmp_nqn(self):
-
-        return
-
-    def create_target_object(self, is_lvol: bool):
-        self.rpc_client2.create_lvol(self, name, size_in_mib, lvs_name, lvol_priority_class=0, ndcs=0, npcs=0):
-        ef
-        create_lvol(self, name, size_in_mib, lvs_name, lvol_priority_class=0, ndcs=0, npcs=0):
+    def snap_init(self, uuid: str, lvol: LogicalVolumeRef, target_lvs: str):
+        s = Snapshot()
+        s.retry = 0
+        s.status = ObjectMigrationState.NEW
+        s.bdev_name = "MIG_SNAP"
+        s.lvs_name = lvol.lvs_name
+        s.lvol_size = lvol.size
+        s.target_lvs_name = lvol.target_lvs_name
+        s.target_lvs_name = target_lvs
+        s.uuid = uuid
+        s.source_uuid = uuid
+        return s
 
-        self.rpc.client2.lvol_set_migration_flag()
-        return
 
-    def connect_client(node):
+    @property
+    def connect_client(node:StorageNode):
         return RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=3, retry=1)
 
-    def check_nodes_online(n1: StorageNode, n2: StorageNode, n3: StorageNode, n4: StorageNode):
+    def check_nodes_online(self, n1: StorageNode, n2: StorageNode, n3: StorageNode, n4: StorageNode):
         if (n1.status == StorageNode.STATUS_ONLINE and
                 n2.status == StorageNode.STATUS_ONLINE and
-                n2.status == StorageNode.STATUS_ONLINE and
-                n3.status == StorageNode.STATUS_ONLINE):
+                n3.status == StorageNode.STATUS_ONLINE and
+                n4.status == StorageNode.STATUS_ONLINE):
             return True
         return False
 
-    def migrate_snaps(self):
-        if self.m.status==MigrationState.RUNNING:
-            for s in self.m.snapshots:
-                if s.snapshot.status==ObjectMigrationState.NEW:
-                   s.snapshot.target_uuid=self.create_target_lvol(s.snapshot.name)
-                   s.snapshot.temporary_nqn,s.snapshot.temporary_namespace=self.create_tmp_nqn(s.snapshot.target_uuid)
-
-                elif s.snapshot.status==ObjectMigrationState.SUSPENDED:
+    def unfreeze_objects(self):
+        db_controller = DBController()
+        l = db_controller.get_lvol_by_id(self.m.main_logical_volume.uuid)
+        l.frozen = False
+        l.write_to_db(db_controller.kv_store)
+        snaps = db_controller.get_snapshots_by_node_id(self.m.node_pri.uuid)
+        for s in snaps:
+            s.frozen = False
+            s.write_to_db(db_controller.kv_store)
+        return
 
+    def get_transfer_state(self, lvolname: str, node_id: str):
 
+        return
 
+    def export_lvol(self, s: Snapshot):
+        # create subsystem
+        # create listener
+        # create namespace
+        return
 
+    def delete_tmp_nqn(self, s: Snapshot):
         return
 
-    def migrate_lvol(self):
+    def get_lvol_by_name(self, lvol_name):
         return
 
-    def cleanup_migration(status: bool):
+    def create_lvol(self, snap: Snapshot):
+            name = snap.target_lvs_name + "/" + snap.bdev_name
+            if snap.status == ObjectMigrationState.NEW:
+                snap_uuid=self.get_lvol_by_name(name)
+                if not snap_uuid:
+                   snap_uuid = self.rpc_client2.create_lvol(name, snap.size, snap.target_lvs_name,
+                                                         self.m.main_logical_volume.priority_class,
+                                                         self.m.main_logical_volume.ndcs,
+                                                         self.m.main_logical_volume.npcs)
+                if snap_uuid:
+                    snap.target_uuid = snap_uuid
+                    snap.status = ObjectMigrationState.LVOL_CREATED
+                    self.m.write_to_db(self.db_controller.kv_store)
+                else:
+                    raise exception(f"could not create lvol on target. snap: {snap.uuid}")
+            return True
+
+    def set_mig_status(self, snap: Snapshot):
+            name = snap.target_lvs_name + "/" + snap.bdev_name
+            if snap.status == ObjectMigrationState.LVOL_CREATED:
+                if not self.rpc_client2.lvol_set_migration_flag(name):
+                    raise (f"issue creating an target object during migration of snapshot {snap.uuid} ")
+                else:
+                    snap.status = ObjectMigrationState.MIG_FLAG_SET
+                    self.m.write_to_db(self.db_controller.kv_store)
+            return True
+
+    def connect_lvol(self, s: Snapshot):
         return
 
-    def check_status_migration(self):
+    def transfer_data(self, snap: Snapshot, offset: int):
+            self.m.completion_poll_queue.append(snap)
+            return
+
+    def convert_lvol(self, s: Snapshot):
+            return
+
+    def convert_to_snap(self, s1, s2: Snapshot):
+            return
+
+    def create_snapshot(self, lvol: LogicalVolumeRef):
+            return
+
+    def time_difference(self):
+           return (datetime.now()-self.prev_time).total_seconds()
+
+    def create_target_lvol(self, s: Snapshot):
+          return
+
+    def create_final_lvol(self):
         return
 
-    def migrate_lvol(self, lvol, target_node: str):
-        """Migrate a logical volume and its snapshots/clones."""
-        self.m.status = MigrationState.NEW
+    def connect_hublvol(self):
+          return
+
+    def transfer_data_final(self):
+          return
 
-        #update lvol: frozen means it cannot be deleted or resized. new snapshots cannot be taken.
+    def reconnect_subsystems(self):
+         return
+
+    def set_mig_state_lvol(self, s: Snapshot):
+         return
+
+    def cleanup_migration(self, status: bool):
         db_controller = DBController()
-        lvol.frozen=True
-        lvol.write_to_db(db_controller.kv_store)
-
-        #get all 4 storage node objects: primary, secondary source and target
-        self.m.node_pri = StorageNode(db_controller.get_storage_node_by_id(self.m.main_logical_volume.node_id))
-        self.m.node_sec = self.m.node_pri.secondary_node_id
-        self.m.target_node_pri = StorageNode(db_controller.get_storage_node_by_id(target_node))
-        self.m.target_node_sec = self.m.target_node_pri.secondary_node_id
-
-        #copy now all data from the lvol to the migration lvol (temporary object for lvol during migration)
-        self.m = self.assign_lvol(lvol, self.m.target_node_pri.lvstore)
-
-        #create rpc clients for both primaries:
-        self.rpc_client1 = self.connect_client(self.m.node_pri)
-        self.rpc_client2 = self.connect_client(self.m.target_node_pri)
-
-        #now we create a chain of snapshots from all snapshots taken from this lvol
-        snapshots=db_controller.get_snapshots()
-        snapshots.sort(key=lambda s: s.created_at)
-        self.m.snapshots = []
-        sr=None
-        for s in snapshots:
-                if s.lvol.uuid==self.m.main_logical_volume.uuid:
-                     s.frozen=True
-                     #need to reset that one on node restart
-                     s.write_to_db(db_controller.kv_store)
-                     srn=Snapshot()
-                     if sr:
-                       sr.next=srn
-                     sr=srn
-                     sr.snapshot=self.assign_snap(s.lvol,s)
-                     self.m.snapshots.append(sr)
+        real_snapshots = db_controller.get_snapshots()
+        self.unfreeze_objects()
+        #Migration was not successful
+        if not status:
+              return
+        else:
+              return
+        return
+
+    def migrate_final_lvol(self):
+      try:
+        if self.m.status==MigrationState.SNAPS_MIGRATED:
+           self.create_final_lvol()
+        elif self.m.status==MigrationState.TARGET_LVOL_CREATED:
+           self.connect_hublvol()
+        elif self.m.status==MigrationState.HUBLVOL_CONNECTED:
+           self.transfer_data_final()
+        elif self.m.status==MigrationState.TRANSFERRED_TO_TARGET:
+           self.reconnect_subsystems()
+        elif self.m.status == MigrationState.RECONNECT_DONE:
+           self.cleanup_migration(True)
+      except:
+        raise (f"cannot transfer to target: {self.m.main_logical_volume.uuid}")
+      return True
 
 
+    def migrate_snaps(self):
+        if self.m.status==MigrationState.RUNNING:
+          try:
+            all_snaps_done = True
+            p=""
+            for s in self.m.snapshots:
+              if s.status is not ObjectMigrationState.DONE:
+                  all_snaps_done = False
+              if s.status in ObjectMigrationState.NEW:
+                  self.create_target_lvol(s)
+              elif s.status in ObjectMigrationState.LVOL_CREATED:
+                  self.set_mig_state_lvol(s)
+              elif s.status in ObjectMigrationState.MIG_FLAG_SET:
+                  self.export_lvol(s)
+              elif s.status in ObjectMigrationState.LVOL_EXPORTED:
+                  self.connect_lvol(s)
+              elif s.status in ObjectMigrationState.LVOL_CONNECTED:
+                  self.transfer_data(s, 0)
+              elif s.status==ObjectMigrationState.TRANSFERRED:
+                   self.convert_to_snap(s,p)
+              elif s.status == ObjectMigrationState.CONVERTED:
+                   self.delete_tmp_nqn(s)
+              p=s
+            if self.m.rerun < 3 or self.time_difference()>5:
+                snap_uuid=self.create_snapshot(self.m.main_logical_volume)
+                sn=self.snap_init(snap_uuid,self.m.main_logical_volume,self.m.target_node_pri.lvstore)
+                self.m.snapshots.append(sn)
+                self.prev_time=datetime.now()
+                self.migrate_snaps()
+            elif all_snaps_done:
+                self.m.status = MigrationState.SNAPS_MIGRATED
+                self.m.write_to_db(self.db_controller.kv_store)
+                self.migrate_final_lvol()
+          except:
+               self.m.pre_status = self.m.status
+               self.m.status = MigrationState.FAILED
+               self.cleanup_migration(False)
+        return True
+
+    def lvol_migrate(self, lvol: LogicalVolumeRef, target_node: StorageNode, m: MigrationObject=None):
+        """Migrate a logical volume and its snapshots/clones."""
+
+        # if this Migration Object does not exist (first call to lvol_migrate):
+        if not m:
+            self.m = MigrationObject()
+            self.m.uuid = str(uuid.uuid4())
+            self.m.create_dt = str(datetime.datetime)
+            self.m.status = MigrationState.NEW
+            self.m.write_to_db(self.db_controller.kv_store)
+        else:
+            self.m = m
+
+        # update lvol: frozen means it cannot be deleted or resized. new snapshots cannot be taken.
+        try:
+            lvol1=self.db_controller.get_lvol_by_id(lvol.uuid)
+            lvol1.frozen = True
+            lvol1.write_to_db(self.db_controller.kv_store)
+
+            # copy now all data from the lvol to the migration lvol (temporary object for lvol during migration)
+            self.m.main_logical_volume = self.lvol_assign(lvol)
+
+            # get all 4 storage node objects: primary, secondary source and target
+            self.m.node_pri = StorageNode(self.db_controller.get_storage_node_by_id(self.m.main_logical_volume.node_id))
+            self.m.node_sec = self.db_controller.get_storage_node_by_id(self.m.node_pri.secondary_node_id)
+            self.m.target_node_pri = target_node
+            self.m.target_node_sec = self.db_controller.get_storage_node_by_id(self.m.target_node_pri.secondary_node_id)
+
+            # create rpc clients for both primaries:
+            self.rpc_client1 = self.connect_client
+            self.rpc_client2 = self.connect_client
+
+            # now we create a chain of snapshots from all snapshots taken from this lvol
+            snapshots = self.db_controller.get_snapshots()
+            snapshots.sort(key=lambda s: s.created_at)
+            self.m.snapshots = []
+            sr = None
+            for s in snapshots:
+                if s.lvol.uuid == self.m.main_logical_volume.uuid:
+                    s.frozen = True
+                    # need to reset that one on node restart
+                    s.write_to_db(self.db_controller.kv_store)
+                    sr = self.snap_assign(self.m.main_logical_volume, s,  self.db_controller.get(self.m.target_node_pri.lvstore)
+                    self.m.snapshots.append(sr)
+        except:
+            return False
+
         if self.check_nodes_online(self.m.node_pri, self.m.node_sec, self.m.target_node_pri, self.m.target_node_sec):
             self.m.status = MigrationState.RUNNING
+            self.m.write_to_db(self.db_controller.kv_store)
             self.migrate_snaps()
+            return True
         else:
             logger.warning(f"Not all nodes online. Suspending lvol life migration {lvol.uuid}")
-            self.m.status=MigrationState.SUSPENDED
-
-
+            self.m.write_to_db(self.db_controller.kv_store)
+            return False
+
+    def check_status_migration(self, on_restart: bool):
+      while True:
+          sleep(10)
+          try:
+            migrations=self.db_controller.get_migrations()
+            for m in migrations:
+              if m.status!=MigrationState.DONE and m.status!=MigrationState.FAILED:
+                 if self.check_nodes_online(m.node_pri,self.db_controller.get_storage_node_by_id(m.node_pri.secondary_node_id),
+                                            m.target_node_pri,m.target_node_sec):
+                     if (m.status==MigrationState.NEW):
+                         self.lvol_migrate(m.main_logical_volume,m.node_pri,m)
+                     elif (m.status==MigrationState.RUNNING):
+                         for q in m.completion_poll_queue:
+                             m.completion_poll_queue.remove(q)
+                             if (q.status==ObjectMigrationState.TRANSFER):
+                                 if q.retry>5:
+                                     raise (f"could not transfer snapshot. max retries. name: {q.lvs_name+"/"+q.bdev_name}. uuid: {q.uuid}")
+                                 q.retry+=1
+                                 result=self.get_transfer_state(q.target_lvs_name+"/"+q.bdev_name)
+                                 if not result.status:
+                                    self.transfer_data(q,result.offset)
+                                    m.completion_poll_queue.append(q)
+                                 else:
+                                    q.status=ObjectMigrationState.TRANSFERRED
+                             self.migrate_snaps
+                     elif (m.status in (MigrationState.SNAPS_MIGRATED, MigrationState.HUBLVOL_CONNECTED, MigrationState.TARGET_LVOL_CREATED, MigrationState.TRANSFERRED_TO_TARGET, MigrationState.RECONNECT_DONE)):
+                          self.migrate_final_lvol()
+          except:
+              logger.error(f"migration controller exception. Migration failed: {m.uuid} ")
+              m.status=MigrationState.FAILED
+              self.cleanup_migration(m, False)
+              return False
+          return True
+
+    migrate_lock = threading.Lock()
+
+    def add_new_migration(self, lvol, target_node: StorageNode):
+      with self.migrate_lock:
+            try:
+              migrations = self.db_controller.get_migrations()
+              for m in migrations:
+                if lvol.node_id==m.main_logical_volume.node_id and (m.status!=MigrationState.DONE or m.status!=MigrationState.FAILED_AND_CLEANED):
+                   raise exception("cannot add migration - ongoing migration")
+              self.lvol_migrate(lvol, target_node)
+            except:
+              logger.error(f"could not add lvol {lvol.uuid} for migration as another migration is currently running.")
+              return False
+
+        #are all 4 nodes online?
+        #if migration is suspended, resume it. If it was before in
+        #depending on previous state, continue in migrate_snaps, migrate_lvol or cleanup
+        #did total time expire? --> cleanup, failed
+        #any snaps in queue?
+        #poll for completion, trigger restart or if completed change the state
+        #stop
+      return
+
+    def migrations_list(self):
+        db_controller = DBController()
+        migrations = db_controller.get_migrations()
+        data = []
+        for m in migrations:
+            logger.debug(m)
+            data.append({
+                "UUID": m.uuid,
+                "Lvol UUID": m.main_logical_volume.uuid,
+                "Primary (source):": m.node_pri,
+                "Primary (target):": m.target_node_pri,
+                "DateTime:": m.create_dt,
+                "Status": m.status,
+            })
+        return utils.print_table(data)
+
+    def start_service(self, on_restart=False):
+        """
+        Starts the migration checker in a background thread.
+        """
+        self._thread = threading.Thread(
+            target=self.check_status_migration, args=(on_restart,), daemon=True
+        )
+        self._thread.start()
+
+    def stop_service(self):
+        """
+        Stops the background service gracefully.
+        """
+        self._stop_event.set()
+        self._thread.join()
 
diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py
index 97a5c3ef0..95e329130 100644
--- a/simplyblock_core/controllers/snapshot_controller.py
+++ b/simplyblock_core/controllers/snapshot_controller.py
@@ -108,6 +108,7 @@ def add(lvol_id, snapshot_name):
     blobid = 0
     snap_uuid = ""
     used_size = 0
+    node_id = lvol.node_id
 
     if lvol.ha_type == "single":
         if snode.status == StorageNode.STATUS_ONLINE:
@@ -223,6 +224,7 @@ def add(lvol_id, snapshot_name):
     snap.lvol = lvol
     snap.fabric = lvol.fabric
     snap.vuid = snap_vuid
+    snap.node_id = node_id
     snap.status = SnapShot.STATUS_ONLINE
 
     snap.write_to_db(db_controller.kv_store)
diff --git a/simplyblock_core/db_controller.py b/simplyblock_core/db_controller.py
index 277d1b68a..5cf48e976 100644
--- a/simplyblock_core/db_controller.py
+++ b/simplyblock_core/db_controller.py
@@ -8,6 +8,7 @@
 from simplyblock_core.models.cluster import Cluster
 from simplyblock_core.models.events import EventObj
 from simplyblock_core.models.job_schedule import JobSchedule
+from simplyblock_core.models.lvol_migration import MigrationObject
 from simplyblock_core.models.lvol_model import LVol
 from simplyblock_core.models.mgmt_node import MgmtNode
 from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice
@@ -19,8 +20,6 @@
     PoolStatObject, CachedLVolStatObject
 from simplyblock_core.models.storage_node import StorageNode
 
-
-
 class Singleton(type):
     _instances = {}  # type: ignore
     def __call__(cls, *args, **kwargs):
@@ -32,8 +31,6 @@ def __call__(cls, *args, **kwargs):
                 cls._instances[cls] = ins
             return ins
 
-
-
 class DBController(metaclass=Singleton):
 
     kv_store=None
@@ -270,7 +267,7 @@ def get_snapshots_by_node_id(self, node_id) -> List[SnapShot]:
         ret = []
         snaps = SnapShot().read_from_db(self.kv_store)
         for snap in snaps:
-            if snap.lvol.node_id == node_id:
+            if snap.node_id == node_id:
                 ret.append(snap)
         return ret
 
@@ -309,3 +306,10 @@ def get_qos(self, cluster_id=None) -> List[QOSClass]:
         else:
             classes = QOSClass().read_from_db(self.kv_store)
         return sorted(classes, key=lambda x: x.class_id)
+
+    def get_migrations(self) -> List[MigrationObject]:
+        ret = MigrationObject().read_from_db(self.kv_store)
+        migrations = []
+        for m in ret:
+            migrations.append(m)
+        return sorted(migrations, key=lambda x: x.create_dt)
diff --git a/simplyblock_core/models/lvol_migration.py b/simplyblock_core/models/lvol_migration.py
index fd204890a..e4dc477bc 100644
--- a/simplyblock_core/models/lvol_migration.py
+++ b/simplyblock_core/models/lvol_migration.py
@@ -5,6 +5,7 @@
 import uuid
 import asyncio
 import storage_node
+from base_model import *
 
 
 # ---------------------------------------------------------------------------
@@ -15,29 +16,32 @@ class MigrationState(str, Enum):
     NEW = "new"
     PREPARING = "preparing"
     RUNNING = "running"
-    SUSPENDED = "suspended"
-    FAILED = "failed"
-    PARTIALLY_FAILED = "partially_failed"
-    DONE = "done"
-
-
-class StreamState(str, Enum):
-    NEW = "new"
-    RUNNING = "running"
-    SUSPENDED = "suspended"
-    FAILED = "failed"
+    SNAPS_MIGRATED = "migrated"
+    TARGET_LVOL_CREATED = "target_lvol_created"
+    HUBLVOL_CONNECTED = "hublvol_connecte"
+    TRANSFERRED_TO_TARGET = "transferred_to_target"
+    RECONNECT_DONE = "reconnect_done"
     CLEANUP = "cleanup"
+    FAILED = "failed"
+    FAILED_AND_CLEANED = "failed_and_cleaned"
     DONE = "done"
 
-
 class ObjectMigrationState(str, Enum):
     NEW = "new"
-    RUNNING = "running"
-    SUSPENDED = "suspended"
-    CANCELED = "failed"
+    LVOL_CREATED = "lvolcreated"
+    MIG_FLAG_SET = "migflagset"
+    NAMESPACE_CREATED = "nscreated"
+    NQN_CREATED = "nqncreated"
+    LVOL_CONNECTED = "lvolconnected"
+    LVOL_EXPORTED = "lvol_exported"
+    TRANSFER = "transferring"
+    RETRANSFER = "retransfer"
+    TRANSFERRED = "transferred"
+    CONVERTED = "converted"
+    CLEANING = "cleaning"
+    FAILED = "failed"
     DONE = "done"
 
-
 # ---------------------------------------------------------------------------
 # DATA MODELS
 # ---------------------------------------------------------------------------
@@ -45,68 +49,63 @@ class ObjectMigrationState(str, Enum):
 @dataclass
 class LogicalVolumeRef:
     """Reference to a logical volume participating in a migration."""
-
-    uuid: str
-    bdev_name: str  # "LVS/LV"
-    lvs_name: str
-    target_lvs_name: str
-    source_uuid: str
-    target_uuid: str
-    namespace_id: str
-    nqn : str
-    node_id: str
-    sec_node_id :str
-    target_node_id : str
-    target_sec_node_id : str
-    mapid: str
-    cloned : str
-    state : ObjectMigrationState
-    crypto_bdev_name: Optional[str] = None
+    uuid: str = ""
+    bdev_name: str = ""  # "LVS/LV"
+    lvs_name: str = ""
+    target_lvs_name: str = ""
+    source_uuid: str = ""
+    target_uuid: str = ""
+    namespace_id: str = ""
+    nqn : str = ""
+    node_id: str = ""
+    sec_node_id :str =""
+    target_node_id : str = ""
+    target_sec_node_id : str = ""
+    ndcs : int = 1
+    npcs : int = 1
+    priority_class : int = 0
+    size : int = 0
+    mapid: int = 0
+    cloned : str = ""
+    state : ObjectMigrationState = ObjectMigrationState.NEW
+    retry : int = 0
+    crypto_bdev_name: str = ""
 
 @dataclass
 class Snapshot:
-    """
-    Global snapshot object, exists only once.
-    Stores all per-snapshot migration metadata.
-    """
-    uuid : str
-    bdev_name: str  # "LVS/LV"
-    lvs_name: str
-    target_lvs_name : str
-    source_uuid : str
-    target_uuid : Optional[str] = None
-
+    uuid : str =""
+    bdev_name: str = "" # "LVS/LV"
+    lvs_name: str = ""
+    size: int = 0
+    target_lvs_name : str = ""
+    source_uuid : str = ""
+    target_uuid : str = ""
+    retry : int = 0
     # Migration metadata
-    temporary_nqn: Optional[str] = None
-    temporary_namespace: Optional[str] = None
-    mapid: Optional[str] = None
+    temporary_nqn: str = ""
+    temporary_namespace: str = ""
+    mapid: int = 0
     status: ObjectMigrationState = ObjectMigrationState.NEW
 
 @dataclass
-class SnapshotRef:
-    """Per-stream linked list node referencing a global snapshot."""
-    snapshot: Snapshot
-    next: Optional["SnapshotRef"] = None
-
-@dataclass
-class MigrationObject:
+class MigrationObject(BaseModel):
     """
     Full migration object, containing multiple streams and logical volumes.
     Snapshots exist independently and are referenced by streams.
     """
+    status: MigrationState = MigrationState.NEW
+    pre_status: MigrationState = MigrationState.NEW
 
-    main_logical_volume : LogicalVolumeRef
-    node_pri : storage_node.StorageNode
-    node_sec: storage_node.StorageNode
-    target_node_pri: storage_node.StorageNode
-    target_node_sec: storage_node.StorageNode
+    main_logical_volume : LogicalVolumeRef = None
+    node_pri : storage_node.StorageNode = None
+    node_sec: storage_node.StorageNode = None
+    target_node_pri: storage_node.StorageNode = None
+    target_node_sec: storage_node.StorageNode = None
 
     # Global snapshot objects (shared across streams)
-    snapshots: List[SnapshotRef]
+    snapshots: List[Snapshot] = None
     # Async queue for polling migration completion (set externally)
-    completion_poll_queue: Optional[asyncio.Queue] = None
-
-    id: str = field(default_factory=lambda: str(uuid.uuid4()))
-    status: MigrationState = MigrationState.NEW
+    completion_poll_queue: List[Snapshot] = None
+    rerun : int = 0
 
 
diff --git a/simplyblock_core/models/snapshot.py b/simplyblock_core/models/snapshot.py
index 65448038a..fdbdde8ea 100644
--- a/simplyblock_core/models/snapshot.py
+++ b/simplyblock_core/models/snapshot.py
@@ -30,3 +30,4 @@ class SnapShot(BaseModel):
     status: str = ""
     fabric: str = "tcp"
     frozen: bool = False
+    node_id : str = ""
diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py
index 3d32dd17a..4593ae97b 100644
--- a/simplyblock_core/storage_node_ops.py
+++ b/simplyblock_core/storage_node_ops.py
@@ -22,10 +22,12 @@
 from simplyblock_core.constants import LINUX_DRV_MASS_STORAGE_NVME_TYPE_ID, LINUX_DRV_MASS_STORAGE_ID
 from simplyblock_core.controllers import lvol_controller, storage_events, snapshot_controller, device_events, \
     device_controller, tasks_controller, health_controller, tcp_ports_events, qos_controller
+from simplyblock_core.controllers.lvol_migration_controller import MigrationController
 from simplyblock_core.db_controller import DBController
 from simplyblock_core.fw_api_client import FirewallClient
 from simplyblock_core.models.iface import IFace
 from simplyblock_core.models.job_schedule import JobSchedule
+from simplyblock_core.models.lvol_migration import MigrationState
 from simplyblock_core.models.lvol_model import LVol
 from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice
 from simplyblock_core.models.snapshot import SnapShot
@@ -125,6 +127,15 @@ def connect_device(name: str, device: NVMeDevice, node: StorageNode, bdev_names:
 
     return bdev_name
 
+#if a node was rebooted during an ongoing migration,
+def restart_migration(node:StorageNode):
+    db_controller = DBController()
+    migs=db_controller.get_migrations()
+    for m in migs:
+        if m.node_pri==node.uuid:
+          if m.status!=MigrationState.DONE:
+            #TODO: continue to run that migration by enabling the migration service.
+    return
 
 def get_next_cluster_device_order(db_controller, cluster_id):
     max_order = 0
@@ -2022,6 +2033,7 @@ def restart_storage_node(
                     online_devices_list.append(dev.get_id())
             if online_devices_list:
                 tasks_controller.add_device_mig_task(online_devices_list, snode.cluster_id)
+            restart_migration(snode)
             return True
 
 

From 85a9489b2ed814f58cfa5fa0699c91e358ab21a5 Mon Sep 17 00:00:00 2001
From: schmidt-scaled <schmidt@scaled.cloud>
Date: Tue, 25 Nov 2025 09:54:59 +0300
Subject: [PATCH 25/68] set size of lvstore cluster in constants (as ratio to
 distrib page size)

---
 .../controllers/lvol_migration_controller.py  | 38 +++++++++----------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/simplyblock_core/controllers/lvol_migration_controller.py b/simplyblock_core/controllers/lvol_migration_controller.py
index 88da79191..d53d24d00 100644
--- a/simplyblock_core/controllers/lvol_migration_controller.py
+++ b/simplyblock_core/controllers/lvol_migration_controller.py
@@ -2,7 +2,6 @@
 from logging import exception
 from time import sleep
 
-from ..cluster_ops import db_controller
 from ..models.lvol_migration import *
 from dataclasses import dataclass
 from typing import Optional
@@ -102,7 +101,7 @@ def lvol_assign(self, lvol:LVol, target_lvs: str):
         m.main_logical_volume.cloned = lvol.cloned_from_snap
         return m.main_logical_volume
 
-    def snap_assign(lvol: LogicalVolumeRef, snap: SnapShot, target_lvs: str):
+    def snap_assign(self, lvol: LogicalVolumeRef, snap: SnapShot, target_lvs: str):
         s = Snapshot()
         s.retry = 0
         s.status = ObjectMigrationState.NEW
@@ -152,22 +151,6 @@ def unfreeze_objects(self):
             s.write_to_db(db_controller.kv_store)
         return
 
-    def get_transfer_state(self, lvolname: str, node_id: str):
-
-        return
-
-    def export_lvol(self, s: Snapshot):
-        # create subsystem
-        # create listener
-        # create namespace
-        return
-
-    def delete_tmp_nqn(self, s: Snapshot):
-        return
-
-    def get_lvol_by_name(self, lvol_name):
-        return
-
     def create_lvol(self, snap: Snapshot):
             name = snap.target_lvs_name + "/" + snap.bdev_name
             if snap.status == ObjectMigrationState.NEW:
@@ -195,6 +178,22 @@ def set_mig_status(self, snap: Snapshot):
                     self.m.write_to_db(self.db_controller.kv_store)
             return True
 
+    def get_transfer_state(self, lvolname: str, node_id: str):
+
+        return
+
+    def export_lvol(self, s: Snapshot):
+        # create subsystem
+        # create listener
+        # create namespace
+        return
+
+    def delete_tmp_nqn(self, s: Snapshot):
+        return
+
+    def get_lvol_by_name(self, lvol_name):
+        return
+
     def connect_lvol(self, s: Snapshot):
         return
 
@@ -241,7 +240,6 @@ def cleanup_migration(self, status: bool):
               return
         else:
               return
-        return
 
     def migrate_final_lvol(self):
       try:
@@ -341,7 +339,7 @@ def lvol_migrate(self, lvol: LogicalVolumeRef, target_node: StorageNode, m: Migr
                     s.frozen = True
                     # need to reset that one on node restart
                     s.write_to_db(self.db_controller.kv_store)
-                    sr = self.snap_assign(self.m.main_logical_volume, s,  self.db_controller.get(self.m.target_node_pri.lvstore)
+                    sr = self.snap_assign(self.m.main_logical_volume, s,  self.db_controller.get(self.m.target_node_pri.lvstore))
                     self.m.snapshots.append(sr)
         except:
             return False

From 43c97a52f08af1a10fe58bbc7e5c08405f91afdd Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Tue, 25 Nov 2025 14:00:05 +0300
Subject: [PATCH 26/68] Set cluster_id optional on SNodeAPI docker version
 (#777)

* Set cluster_id optional on SNodeAPI docker version

* fix type checker

* fix type checker
---
 simplyblock_core/snode_client.py                        | 2 +-
 simplyblock_web/api/internal/storage_node/kubernetes.py | 6 ++++++
 simplyblock_web/utils.py                                | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py
index 5e5f66f60..c9b40e878 100644
--- a/simplyblock_core/snode_client.py
+++ b/simplyblock_core/snode_client.py
@@ -73,7 +73,7 @@ def _request(self, method, path, payload=None):
         return None, None
 
     def is_live(self):
-        return self._request("GET", "/check")
+        return self._request("GET", "check")
 
     def info(self):
         return self._request("GET", "info")
diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py
index b6ab71b63..d5e98eb1d 100644
--- a/simplyblock_web/api/internal/storage_node/kubernetes.py
+++ b/simplyblock_web/api/internal/storage_node/kubernetes.py
@@ -492,6 +492,9 @@ def spdk_process_kill(query: utils.RPCPortParams):
     k8s_core_v1 = core_utils.get_k8s_core_client()
     try:
         namespace = node_utils_k8s.get_namespace()
+        if not query.cluster_id:
+            return utils.get_response(False, "param required: cluster_id")
+
         first_six_cluster_id = core_utils.first_six_chars(query.cluster_id)
         pod_name = f"snode-spdk-pod-{query.rpc_port}-{first_six_cluster_id}"
         resp = k8s_core_v1.delete_namespaced_pod(pod_name, namespace)
@@ -555,6 +558,9 @@ def _is_pod_present(rpc_port, cluster_id):
     })}}},
 })
 def spdk_process_is_up(query: utils.RPCPortParams):
+    if not query.cluster_id:
+        return utils.get_response(False, "param required: cluster_id")
+
     first_six_cluster_id = core_utils.first_six_chars(query.cluster_id)
     if _is_pod_up(query.rpc_port, first_six_cluster_id):
         return utils.get_response(True)
diff --git a/simplyblock_web/utils.py b/simplyblock_web/utils.py
index 27ff2ce18..a610cd177 100644
--- a/simplyblock_web/utils.py
+++ b/simplyblock_web/utils.py
@@ -149,7 +149,7 @@ def error_handler(exception: Exception):
 
 class RPCPortParams(BaseModel):
     rpc_port: int = Field(constants.RPC_HTTP_PROXY_PORT, ge=0, le=65536)
-    cluster_id: str
+    cluster_id: Optional[str]
 
 
 class DeviceParams(BaseModel):

From 33ee3e4288fbe4edde57945860091dcef39ca77c Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Wed, 26 Nov 2025 12:40:05 +0100
Subject: [PATCH 27/68] add cluster_id param for spdk_process_is_up (#779)

* add cluster_id param for spdk_process_is_up

* update image tag

* update image tag

* update env image tag to main
---
 simplyblock_core/controllers/health_controller.py | 4 ++--
 simplyblock_core/env_var                          | 4 ++--
 simplyblock_core/services/storage_node_monitor.py | 2 +-
 simplyblock_core/snode_client.py                  | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/simplyblock_core/controllers/health_controller.py b/simplyblock_core/controllers/health_controller.py
index c013e2d58..94855f111 100644
--- a/simplyblock_core/controllers/health_controller.py
+++ b/simplyblock_core/controllers/health_controller.py
@@ -128,11 +128,11 @@ def _check_node_api(ip):
     return False
 
 
-def _check_spdk_process_up(ip, rpc_port):
+def _check_spdk_process_up(ip, rpc_port, cluster_id):
     try:
         snode_api = SNodeClient(f"{ip}:5000", timeout=10, retry=2)
         logger.debug(f"Node API={ip}:5000")
-        is_up, _ = snode_api.spdk_process_is_up(rpc_port)
+        is_up, _ = snode_api.spdk_process_is_up(rpc_port, cluster_id)
         logger.debug(f"SPDK is {is_up}")
         return is_up
     except Exception as e:
diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var
index 468ba7a02..f34a430a9 100644
--- a/simplyblock_core/env_var
+++ b/simplyblock_core/env_var
@@ -1,6 +1,6 @@
 SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev
-SIMPLY_BLOCK_VERSION=19.2.25
+SIMPLY_BLOCK_VERSION=19.2.27
 
-SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main-lvol-sync-delete
+SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main
 SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest
 
diff --git a/simplyblock_core/services/storage_node_monitor.py b/simplyblock_core/services/storage_node_monitor.py
index 17a7d0369..bfb92c11b 100644
--- a/simplyblock_core/services/storage_node_monitor.py
+++ b/simplyblock_core/services/storage_node_monitor.py
@@ -289,7 +289,7 @@ def node_rpc_timeout_check_and_report(node):
             spdk_process = False
             if node_api_check:
                 # 3- check spdk_process
-                spdk_process = health_controller._check_spdk_process_up(snode.mgmt_ip, snode.rpc_port)
+                spdk_process = health_controller._check_spdk_process_up(snode.mgmt_ip, snode.rpc_port, snode.cluster_id)
             logger.info(f"Check: spdk process {snode.mgmt_ip}:5000 ... {spdk_process}")
 
                 # 4- check rpc
diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py
index c9b40e878..6f1bee0db 100644
--- a/simplyblock_core/snode_client.py
+++ b/simplyblock_core/snode_client.py
@@ -154,8 +154,8 @@ def bind_device_to_spdk(self, device_pci):
         params = {"device_pci": device_pci}
         return self._request("POST", "bind_device_to_spdk", params)
 
-    def spdk_process_is_up(self, rpc_port):
-        params = {"rpc_port": rpc_port}
+    def spdk_process_is_up(self, rpc_port, cluster_id):
+        params = {"rpc_port": rpc_port, "cluster_id": cluster_id}
         return self._request("GET", "spdk_process_is_up", params)
 
     def get_file_content(self, file_name):

From 2531483bf224ddb6fbec886de3164f667d21e7fd Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Thu, 27 Nov 2025 09:43:34 +0100
Subject: [PATCH 28/68] updated images for openshift preflight check (#741)

* updated images for openshift preflight check

* added Lincense

* updated maintainer

* fixed cyclic terminfo symlink

* check that the directory exist

* create rm directory

* remove rm directory
---
 docker/Dockerfile | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index ce1a83ae1..1e1f8c3bd 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,12 +1,33 @@
 # syntax=docker/dockerfile:1
 FROM simplyblock/simplyblock:base_image
 
+LABEL name="simplyblock"
+LABEL vendor="Simplyblock"
+LABEL version="1.0.0"
+LABEL release="1"
+LABEL summary="Simplyblock controlplane plane component"
+LABEL description="Simplyblock controlplane plane container"
+LABEL maintainer="developers@simplyblock.io"
+
+COPY LICENSE /licenses/LICENSE
+
 WORKDIR /app
 
 COPY requirements.txt .
 
-RUN pip3 install -r requirements.txt
+RUN pip3 install --no-cache-dir -r requirements.txt
+
 
 COPY . /app
 
 RUN python setup.py install
+
+RUN if [ -d /usr/share/terminfo ]; then \
+       find /usr/share/terminfo -lname '*ncr260vt300wpp*' -exec rm -f {} + ; \
+       rm -f /usr/share/terminfo/n/ncr260vt300wpp || true ; \
+    fi
+
+RUN useradd -u 1001 -r -g 0 -d /app -s /sbin/nologin simplyblock && \
+    chown -R 1001:0 /app
+
+USER 1001

From 36f45b95d87d0e3c7a496ed6513eab4788bf4dea Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Thu, 27 Nov 2025 22:56:10 +0100
Subject: [PATCH 29/68] added graylog env GRAYLOG_MESSAGE_JOURNAL_MAX_SIZE
 (#782)

---
 simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml b/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml
index 9c0f46e1f..1349a33a9 100644
--- a/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml
+++ b/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml
@@ -68,6 +68,8 @@ spec:
               value: "false"
             - name: GRAYLOG_ELASTICSEARCH_REPLICAS
               value: "1"
+            - name: GRAYLOG_MESSAGE_JOURNAL_MAX_SIZE
+              value: "10gb"
           ports:
             - containerPort: 5044
             - containerPort: 5140

From 699d664813e44f54e75193ea89dbb55ba084fb59 Mon Sep 17 00:00:00 2001
From: schmidt-scaled <schmidt@scaled.cloud>
Date: Tue, 2 Dec 2025 00:09:47 +0300
Subject: [PATCH 30/68] set size of lvstore cluster in constants (as ratio to
 distrib page size)

---
 .../controllers/lvol_migration_controller.py  | 349 ++++++++++++------
 .../controllers/snapshot_controller.py        | 268 ++++++++------
 simplyblock_core/models/base_model.py         |  26 +-
 simplyblock_core/models/lvol_migration.py     |  63 ++--
 simplyblock_core/models/lvol_model.py         |   1 +
 simplyblock_core/models/snapshot.py           |  13 +
 6 files changed, 456 insertions(+), 264 deletions(-)

diff --git a/simplyblock_core/controllers/lvol_migration_controller.py b/simplyblock_core/controllers/lvol_migration_controller.py
index d53d24d00..5aefd49d0 100644
--- a/simplyblock_core/controllers/lvol_migration_controller.py
+++ b/simplyblock_core/controllers/lvol_migration_controller.py
@@ -2,6 +2,9 @@
 from logging import exception
 from time import sleep
 
+from jc.parsers.asn1crypto.core import Boolean
+
+from ..cluster_ops import db_controller
 from ..models.lvol_migration import *
 from dataclasses import dataclass
 from typing import Optional
@@ -23,6 +26,11 @@
 # Migration Service
 # ---------------------------------------------------------------------------
 
+def generate_nqn():
+    random_uuid = str(uuid.uuid4())
+    nqn = f"nqn.2024-01.io.simplyblock:tmp:{random_uuid}"
+    return nqn
+
 class MigrationQueueObjectType:
     SNAPSHOT = "snapshot"
     CLONE = "clone"
@@ -53,6 +61,7 @@ def reset(self):
         self.objects.clear()
 
 
+
 class MigrationService:
     """Service containing core migration logic."""
 
@@ -66,6 +75,102 @@ class MigrationService:
 # ---------------------------------------------------------------------------
 
 
+def get_lvol_by_name(lvol_name):
+    return LVol
+
+
+def snap_assign(lvol: LogicalVolumeRef, snap: SnapShot, target_lvs: str):
+    s = Snapshot()
+    s.retry = 0
+    s.status = ObjectMigrationState.NEW
+    s.bdev_name = snap.snap_bdev.split("/", 1)[1]
+    s.lvs_name = lvol.lvs_name
+    s.lvol_size = snap.size
+    s.target_lvs_name = lvol.target_lvs_name
+    s.target_lvs_name = target_lvs
+    s.uuid = snap.uuid
+    s.source_uuid = snap.snap_uuid
+    return s
+
+
+def snap_init(uuid: str, lvol: LogicalVolumeRef, target_lvs: str):
+    s = Snapshot()
+    s.retry = 0
+    s.status = ObjectMigrationState.NEW
+    s.bdev_name = "MIG_SNAP"
+    s.lvs_name = lvol.lvs_name
+    s.lvol_size = lvol.size
+    s.target_lvs_name = lvol.target_lvs_name
+    s.target_lvs_name = target_lvs
+    s.uuid = uuid
+    s.source_uuid = uuid
+    return s
+
+
+def check_nodes_online(n1: StorageNode, n2: StorageNode, n3: StorageNode, n4: StorageNode):
+    if (n1.status == StorageNode.STATUS_ONLINE and
+            n2.status == StorageNode.STATUS_ONLINE and
+            n3.status == StorageNode.STATUS_ONLINE and
+            n4.status == StorageNode.STATUS_ONLINE):
+        return True
+    return False
+
+
+def delete_hub_lvol_controller():
+    return -1
+
+
+def lvol_assign(lvol:LVol, target_lvs: str):
+    m=MigrationObject()
+    m.main_logical_volume.state = ObjectMigrationState.NEW
+
+    #unique identifier:
+    m.main_logical_volume.retry=0
+    m.main_logical_volume.uuid = lvol.uuid
+    m.main_logical_volume.bdev_name = lvol.lvol_bdev
+    m.main_logical_volume.lvs_name = lvol.lvs_name
+    m.main_logical_volume.target_lvs_name = target_lvs
+    m.main_logical_volume.nqn = lvol.nqn
+    m.main_logical_volume.source_uuid = lvol.lvol_uuid
+    m.main_logical_volume.node_id = lvol.hostname
+    if lvol.crypto_bdev != "":
+       m.main_logical_volume.crypto_bdev_name = lvol.crypto_bdev
+    m.main_logical_volume.mapid = 0
+    m.main_logical_volume.size = lvol.size
+    m.main_logical_volume.ndcs = lvol.ndcs
+    m.main_logical_volume.npcs = lvol.npcs
+    m.main_logical_volume.priority_class = lvol.lvol_priority_class
+    m.main_logical_volume.namespace_id = lvol.namespace
+    m.main_logical_volume.cloned = lvol.cloned_from_snap
+    return m.main_logical_volume
+
+
+def get_transfer_state(lvolname: str, node_id: str):
+    offset=0
+    return -1,offset
+
+
+def create_snapshot(lvol: LogicalVolumeRef):
+        return -1,""
+
+
+def migrations_list():
+    db_controller = DBController()
+    migrations = db_controller.get_migrations()
+    data = []
+    for m in migrations:
+        logger.debug(m)
+        data.append({
+            "UUID": m.uuid,
+            "Lvol UUID": m.main_logical_volume.uuid,
+            "Primary (source):": m.node_pri,
+            "Primary (target):": m.target_node_pri,
+            "DateTime:": m.create_dt,
+            "Status": m.status,
+        })
+    return utils.print_table(data)
+
+
 class MigrationController:
     """Controller orchestrates LVOL migrations."""
 
@@ -77,68 +182,15 @@ def __init__(self):
         self.db_controller = DBController()
         self.prev_time = datetime.now()
 
-    def lvol_assign(self, lvol:LVol, target_lvs: str):
-        m=MigrationObject()
-        m.main_logical_volume.state = ObjectMigrationState.NEW
-
-        #unique identifier:
-        m.main_logical_volume.retry=0
-        m.main_logical_volume.uuid = lvol.uuid
-        m.main_logical_volume.bdev_name = lvol.lvol_bdev
-        m.main_logical_volume.lvs_name = lvol.lvs_name
-        m.main_logical_volume.target_lvs_name = target_lvs
-        m.main_logical_volume.nqn = lvol.nqn
-        m.main_logical_volume.source_uuid = lvol.lvol_uuid
-        m.main_logical_volume.node_id = lvol.hostname
-        if lvol.crypto_bdev != "":
-           m.main_logical_volume.crypto_bdev_name = lvol.crypto_bdev
-        m.main_logical_volume.mapid = 0
-        m.main_logical_volume.size = lvol.size
-        m.main_logical_volume.ndcs = lvol.ndcs
-        m.main_logical_volume.npcs = lvol.npcs
-        m.main_logical_volume.priority_class = lvol.lvol_priority_class
-        m.main_logical_volume.namespace_id = lvol.namespace
-        m.main_logical_volume.cloned = lvol.cloned_from_snap
-        return m.main_logical_volume
-
-    def snap_assign(self, lvol: LogicalVolumeRef, snap: SnapShot, target_lvs: str):
-        s = Snapshot()
-        s.retry = 0
-        s.status = ObjectMigrationState.NEW
-        s.bdev_name = snap.snap_bdev.split("/", 1)[1]
-        s.lvs_name = lvol.lvs_name
-        s.lvol_size = snap.size
-        s.target_lvs_name = lvol.target_lvs_name
-        s.target_lvs_name = target_lvs
-        s.uuid = snap.uuid
-        s.source_uuid = snap.snap_uuid
-        return s
-
-    def snap_init(self, uuid: str, lvol: LogicalVolumeRef, target_lvs: str):
-        s = Snapshot()
-        s.retry = 0
-        s.status = ObjectMigrationState.NEW
-        s.bdev_name = "MIG_SNAP"
-        s.lvs_name = lvol.lvs_name
-        s.lvol_size = lvol.size
-        s.target_lvs_name = lvol.target_lvs_name
-        s.target_lvs_name = target_lvs
-        s.uuid = uuid
-        s.source_uuid = uuid
-        return s
-
-
-    @property
-    def connect_client(node:StorageNode):
+    def connect_client(self, node:StorageNode):
         return RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=3, retry=1)
 
-    def check_nodes_online(self, n1: StorageNode, n2: StorageNode, n3: StorageNode, n4: StorageNode):
-        if (n1.status == StorageNode.STATUS_ONLINE and
-                n2.status == StorageNode.STATUS_ONLINE and
-                n3.status == StorageNode.STATUS_ONLINE and
-                n4.status == StorageNode.STATUS_ONLINE):
-            return True
-        return False
+    def connect_clients(self):
+        self.m.rpc_client1 = self.connect_client(self.m.node_pri)
+        self.m.rpc_client2 = self.connect_client(self.m.node_sec)
+        self.m.rpc_client3 = self.connect_client(self.m.target_node_pri)
+        self.m.rpc_client4 = self.connect_client(self.m.target_node_sec)
+        return
 
     def unfreeze_objects(self):
         db_controller = DBController()
@@ -154,9 +206,9 @@ def unfreeze_objects(self):
     def create_lvol(self, snap: Snapshot):
             name = snap.target_lvs_name + "/" + snap.bdev_name
             if snap.status == ObjectMigrationState.NEW:
-                snap_uuid=self.get_lvol_by_name(name)
+                snap_uuid= get_lvol_by_name(name)
                 if not snap_uuid:
-                   snap_uuid = self.rpc_client2.create_lvol(name, snap.size, snap.target_lvs_name,
+                   snap_uuid = self.m.rpc_client2.create_lvol(name, snap.size, snap.target_lvs_name,
                                                          self.m.main_logical_volume.priority_class,
                                                          self.m.main_logical_volume.ndcs,
                                                          self.m.main_logical_volume.npcs)
@@ -171,30 +223,70 @@ def create_lvol(self, snap: Snapshot):
     def set_mig_status(self, snap: Snapshot):
             name = snap.target_lvs_name + "/" + snap.bdev_name
             if snap.status == ObjectMigrationState.LVOL_CREATED:
-                if not self.rpc_client2.lvol_set_migration_flag(name):
-                    raise (f"issue creating an target object during migration of snapshot {snap.uuid} ")
+                if not self.m.rpc_client2.lvol_set_migration_flag(name):
+                    raise f'issue creating an target object during migration of snapshot {snap.uuid} '
                 else:
                     snap.status = ObjectMigrationState.MIG_FLAG_SET
                     self.m.write_to_db(self.db_controller.kv_store)
             return True
 
-    def get_transfer_state(self, lvolname: str, node_id: str):
-
-        return
+    def check_online_and_leader(self, node: StorageNode):
+        if node.uuid==self.m.node_pri.uuid:
+            client=self.m.rpc_client1
+        elif node.uuid==self.m.target_node_pri.uuid:
+            client=self.m.rpc_client3
+        elif node.uuid==self.m.node_sec.uuid:
+            client=self.m.rpc_client2
+        elif node.uuid==self.m.target_node_sec.uuid:
+            client = self.m.rpc_client4
+        else:
+            raise f"migration: invalid node, cannot cleanup: {self.m.uuid}"
+        if node.status!=StorageNode.STATUS_ONLINE:
+            raise f"migration: node not online, cannot cleanup: {self.m.uuid}"
+        return client
 
     def export_lvol(self, s: Snapshot):
-        # create subsystem
-        # create listener
-        # create namespace
+        client = self.check_online_and_leader(self.m.node_pri)
+        nqn=generate_nqn()
+        client.subsystem_create(nqn,"tmp-mig", "sb-internal", 1, 1)
+        client.nvmf_subsystem_add_ns(s.temporary_nqn,s.lvs_name+"/"+s.bdev_name)
+        if self.m.target_node_pri.active_rdma:
+            fabric="RDMA"
+        else:
+            fabric="TCP"
+        client.nvmf_subsystem_add_listener(s.temporary_nqn, fabric,self.m.target_node_pri.nvmf_port,
+                    self.m.target_node_pri.hostname, "optimized")
         return
 
-    def delete_tmp_nqn(self, s: Snapshot):
+    #delete subystem only, if there is only zero or one namespaces left;
+    #if one namespace is left, it must match the volume
+    def delete_nqn_and_namespace(self, node: StorageNode, nqn:str, lvol: LVol):
+        client=self.check_online_and_leader(node)
+        data=client.subsystem_list(nqn)
+        for subsystem in data['result']:
+            # Check if the subsystem has namespaces
+            namespaces = subsystem.get('namespaces')
+            if len(namespaces)==1:
+               for ns in namespaces:
+                   if ns['nsid']==lvol.namespace:
+                       client.subsystem_delete(nqn)
+            if len(namespaces)==0:
+                client.subsystem_delete(nqn)
         return
 
-    def get_lvol_by_name(self, lvol_name):
-        return
 
     def connect_lvol(self, s: Snapshot):
+
+        return
+
+    def delete_lvol_from_node(self, node: StorageNode, oid: str, deleteType: Boolean):
+        client=self.check_online_and_leader(node)
+        lvol=db_controller.get_lvol_by_id(oid)
+        if lvol:
+           client.delete_lvol(lvol.lvs_name+"/"+lvol.lvol_name, deleteType)
+        else:
+           snap=db_controller.get_snapshot_by_id(oid)
+           client.delete_lvol(snap.lvol.lvs_name + "/" + snap.lvol.lvol_name, deleteType)
         return
 
     def transfer_data(self, snap: Snapshot, offset: int):
@@ -202,14 +294,13 @@ def transfer_data(self, snap: Snapshot, offset: int):
             return
 
     def convert_lvol(self, s: Snapshot):
+            client=self.check_online_and_leader(self.m.target_node_pri)
+            client.
             return
 
     def convert_to_snap(self, s1, s2: Snapshot):
             return
 
-    def create_snapshot(self, lvol: LogicalVolumeRef):
-            return
-
     def time_difference(self):
            return (datetime.now()-self.prev_time).total_seconds()
 
@@ -236,10 +327,36 @@ def cleanup_migration(self, status: bool):
         real_snapshots = db_controller.get_snapshots()
         self.unfreeze_objects()
         #Migration was not successful
-        if not status:
-              return
-        else:
-              return
+        try:
+          if self.m.status >= MigrationState.HUBLVOL_CONNECTED:
+              ret= delete_hub_lvol_controller()
+          if not status:
+              pri_node=self.m.node_pri
+              sec_node=self.m.node_sec
+          else:
+              pri_node = self.m.target_node_pri
+              sec_node = self.m.target_node_sec
+
+          if (self.m.status >= MigrationState.TARGET_LVOL_CREATED and not status) or self.m.status == MigrationState.DONE:
+              self.delete_nqn_and_namespace(pri_node, self.m.main_logical_volume.uuid)
+              self.delete_nqn_and_namespace(sec_node, self.m.main_logical_volume.uuid)
+              self.delete_lvol_from_node(pri_node, self.m.main_logical_volume.uuid)
+              self.unregister_lvol_from_node(sec_node, self.m.main_logical_volume.uuid)
+
+          snaps = self.m.snapshots
+          snaps.reverse()
+          for sn in snaps:
+                     if sn.uuid:
+                        rsn = db_controller.get_snapshot_by_id(sn.uuid)
+                        if len(rsn.successor)==1:
+                            self.delete_lvol_from_node(pri_node, sn.uuid)
+                            self.delete_nqn_and_namespace(pri_node,sn.uuid)
+                            self.delete_lvol_from_node(sec_node, sn.uuid)
+                        else:
+                            break
+        except:
+            raise f"cleanup of migration not successful, will try later {self.m.uuid}"
+        return True
 
     def migrate_final_lvol(self):
       try:
@@ -254,10 +371,9 @@ def migrate_final_lvol(self):
         elif self.m.status == MigrationState.RECONNECT_DONE:
            self.cleanup_migration(True)
       except:
-        raise (f"cannot transfer to target: {self.m.main_logical_volume.uuid}")
+        raise f"cannot transfer to target: {self.m.main_logical_volume.uuid}"
       return True
 
-
     def migrate_snaps(self):
         if self.m.status==MigrationState.RUNNING:
           try:
@@ -279,11 +395,13 @@ def migrate_snaps(self):
               elif s.status==ObjectMigrationState.TRANSFERRED:
                    self.convert_to_snap(s,p)
               elif s.status == ObjectMigrationState.CONVERTED:
-                   self.delete_tmp_nqn(s)
+                   self.delete_nqn_and_namespace(self.m.target_node_pri,s.uuid)
+              elif s.status == ObjectMigrationState.CLEANING:
+                   self.delete_lvol_from_node(self.m.target_node_sec, s.uuid)
               p=s
             if self.m.rerun < 3 or self.time_difference()>5:
-                snap_uuid=self.create_snapshot(self.m.main_logical_volume)
-                sn=self.snap_init(snap_uuid,self.m.main_logical_volume,self.m.target_node_pri.lvstore)
+                ret, snap_uuid=create_snapshot(self.m.main_logical_volume)
+                sn= snap_init(snap_uuid, self.m.main_logical_volume, self.m.target_node_pri.lvstore)
                 self.m.snapshots.append(sn)
                 self.prev_time=datetime.now()
                 self.migrate_snaps()
@@ -310,24 +428,25 @@ def lvol_migrate(self, lvol: LogicalVolumeRef, target_node: StorageNode, m: Migr
         else:
             self.m = m
 
-        # update lvol: frozen means it cannot be deleted or resized. new snapshots cannot be taken.
+        # update lvol: frozen means it cannot be deleself.m.main_logical_volume ted or resized. new snapshots cannot be taken.
         try:
             lvol1=self.db_controller.get_lvol_by_id(lvol.uuid)
             lvol1.frozen = True
             lvol1.write_to_db(self.db_controller.kv_store)
 
             # copy now all data from the lvol to the migration lvol (temporary object for lvol during migration)
-            self.m.main_logical_volume = self.lvol_assign(lvol)
 
-            # get all 4 storage node objects: primary, secondary source and target
-            self.m.node_pri = StorageNode(self.db_controller.get_storage_node_by_id(self.m.main_logical_volume.node_id))
+            self.m.node_pri = StorageNode(self.db_controller.get_storage_node_by_id(lvol1.node_id))
             self.m.node_sec = self.db_controller.get_storage_node_by_id(self.m.node_pri.secondary_node_id)
             self.m.target_node_pri = target_node
             self.m.target_node_sec = self.db_controller.get_storage_node_by_id(self.m.target_node_pri.secondary_node_id)
 
+            self.m.main_logical_volume = lvol_assign(lvol1,self.m.node_pri.lvstore)
+
+            # get all 4 storage node objects: primary, secondary source and target
+
             # create rpc clients for both primaries:
-            self.rpc_client1 = self.connect_client
-            self.rpc_client2 = self.connect_client
+            self.connect_clients()
 
             # now we create a chain of snapshots from all snapshots taken from this lvol
             snapshots = self.db_controller.get_snapshots()
@@ -339,12 +458,12 @@ def lvol_migrate(self, lvol: LogicalVolumeRef, target_node: StorageNode, m: Migr
                     s.frozen = True
                     # need to reset that one on node restart
                     s.write_to_db(self.db_controller.kv_store)
-                    sr = self.snap_assign(self.m.main_logical_volume, s,  self.db_controller.get(self.m.target_node_pri.lvstore))
+                    sr = snap_assign(self.m.main_logical_volume, s,  self.m.target_node_pri.lvstore)
                     self.m.snapshots.append(sr)
         except:
             return False
 
-        if self.check_nodes_online(self.m.node_pri, self.m.node_sec, self.m.target_node_pri, self.m.target_node_sec):
+        if check_nodes_online(self.m.node_pri, self.m.node_sec, self.m.target_node_pri, self.m.target_node_sec):
             self.m.status = MigrationState.RUNNING
             self.m.write_to_db(self.db_controller.kv_store)
             self.migrate_snaps()
@@ -361,30 +480,30 @@ def check_status_migration(self, on_restart: bool):
             migrations=self.db_controller.get_migrations()
             for m in migrations:
               if m.status!=MigrationState.DONE and m.status!=MigrationState.FAILED:
-                 if self.check_nodes_online(m.node_pri,self.db_controller.get_storage_node_by_id(m.node_pri.secondary_node_id),
+                 if check_nodes_online(m.node_pri,self.db_controller.get_storage_node_by_id(m.node_pri.secondary_node_id),
                                             m.target_node_pri,m.target_node_sec):
-                     if (m.status==MigrationState.NEW):
+                     if m.status==MigrationState.NEW:
                          self.lvol_migrate(m.main_logical_volume,m.node_pri,m)
-                     elif (m.status==MigrationState.RUNNING):
+                     elif m.status==MigrationState.RUNNING:
                          for q in m.completion_poll_queue:
                              m.completion_poll_queue.remove(q)
-                             if (q.status==ObjectMigrationState.TRANSFER):
+                             if q.status==ObjectMigrationState.TRANSFER:
                                  if q.retry>5:
-                                     raise (f"could not transfer snapshot. max retries. name: {q.lvs_name+"/"+q.bdev_name}. uuid: {q.uuid}")
+                                     raise f"could not transfer snapshot. max retries. name: {q.lvs_name + "/" + q.bdev_name}. uuid: {q.uuid}"
                                  q.retry+=1
-                                 result=self.get_transfer_state(q.target_lvs_name+"/"+q.bdev_name)
-                                 if not result.status:
-                                    self.transfer_data(q,result.offset)
+                                 result, offset = get_transfer_state(q.target_lvs_name + "/" + q.bdev_name, self.m.node_pri.uuid)
+                                 if not result:
+                                    self.transfer_data(q,offset)
                                     m.completion_poll_queue.append(q)
                                  else:
                                     q.status=ObjectMigrationState.TRANSFERRED
-                             self.migrate_snaps
-                     elif (m.status in (MigrationState.SNAPS_MIGRATED, MigrationState.HUBLVOL_CONNECTED, MigrationState.TARGET_LVOL_CREATED, MigrationState.TRANSFERRED_TO_TARGET, MigrationState.RECONNECT_DONE)):
+                             self.migrate_snaps()
+                     elif m.status in (MigrationState.SNAPS_MIGRATED, MigrationState.HUBLVOL_CONNECTED, MigrationState.TARGET_LVOL_CREATED, MigrationState.TRANSFERRED_TO_TARGET, MigrationState.RECONNECT_DONE):
                           self.migrate_final_lvol()
           except:
-              logger.error(f"migration controller exception. Migration failed: {m.uuid} ")
-              m.status=MigrationState.FAILED
-              self.cleanup_migration(m, False)
+              logger.error(f"migration controller exception. Migration failed: {self.m.uuid} ")
+              self.m.status=MigrationState.FAILED
+              self.cleanup_migration(False)
               return False
           return True
 
@@ -409,23 +528,7 @@ def add_new_migration(self, lvol, target_node: StorageNode):
         #any snaps in queue?
         #poll for completion, trigger restart or if completed change the state
         #stop
-      return
-
-    def migrations_list(self):
-        db_controller = DBController()
-        migrations = db_controller.get_migrations()
-        data = []
-        for m in migrations:
-            logger.debug(m)
-            data.append({
-                "UUID": m.uuid,
-                "Lvol UUID": m.main_logical_volume.uuid,
-                "Primary (source):": m.node_pri,
-                "Primary (target):": m.target_node_pri,
-                "DateTime:": m.create_dt,
-                "Status": m.status,
-            })
-        return utils.print_table(data)
+      return None
 
     def start_service(self, on_restart=False):
         """
diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py
index 95e329130..2b5880e38 100644
--- a/simplyblock_core/controllers/snapshot_controller.py
+++ b/simplyblock_core/controllers/snapshot_controller.py
@@ -8,7 +8,7 @@
 from simplyblock_core import utils, constants
 from simplyblock_core.db_controller import DBController
 from simplyblock_core.models.pool import Pool
-from simplyblock_core.models.snapshot import SnapShot
+from simplyblock_core.models.snapshot import SnapShot, SnapshotRef
 from simplyblock_core.models.lvol_model import LVol
 from simplyblock_core.models.storage_node import StorageNode
 from simplyblock_core.rpc_client import RPCClient
@@ -16,6 +16,8 @@
 import threading
 from collections import defaultdict
 
+from simplyblock_core.services.lvol_stat_collector import rpc_client
+
 # A dictionary to hold locks per node
 node_locks = defaultdict(threading.Lock)
 node_locks_global_lock = threading.Lock()  # protects the node_locks dict
@@ -46,14 +48,14 @@ def add(lvol_id, snapshot_name):
 
   node_lock = get_node_lock(lvol.node_id)
   with node_lock:
-
-    pool = db_controller.get_pool_by_id(lvol.pool_uuid)
-    if pool.status == Pool.STATUS_INACTIVE:
+    try:
+      pool = db_controller.get_pool_by_id(lvol.pool_uuid)
+      if pool.status == Pool.STATUS_INACTIVE:
         msg = "Pool is disabled"
         logger.error(msg)
         return False, msg
 
-    if lvol.cloned_from_snap:
+      if lvol.cloned_from_snap:
         snap = db_controller.get_snapshot_by_id(lvol.cloned_from_snap)
         ref_count = snap.ref_count
         if snap.snap_ref_id:
@@ -65,55 +67,57 @@ def add(lvol_id, snapshot_name):
             logger.error(msg)
             return False, msg
 
-    for sn in db_controller.get_snapshots():
+      for sn in db_controller.get_snapshots():
         if sn.cluster_id == pool.cluster_id:
             if sn.snap_name == snapshot_name:
                 return False, f"Snapshot name must be unique: {snapshot_name}"
 
-    logger.info(f"Creating snapshot: {snapshot_name} from LVol: {lvol.get_id()}")
-    snode = db_controller.get_storage_node_by_id(lvol.node_id)
-
-    rec = db_controller.get_lvol_stats(lvol, 1)
-    if rec:
+      logger.info(f"Creating snapshot: {snapshot_name} from LVol: {lvol.get_id()}")
+      snode = db_controller.get_storage_node_by_id(lvol.node_id)
+      rec = db_controller.get_lvol_stats(lvol, 1)
+      if rec:
         size = rec[0].size_used
-    else:
+      else:
         size = lvol.size
 
-    if 0 < pool.lvol_max_size < size:
+      if 0 < pool.lvol_max_size < size:
         msg = f"Pool Max LVol size is: {utils.humanbytes(pool.lvol_max_size)}, LVol size: {utils.humanbytes(size)} must be below this limit"
         logger.error(msg)
         return False, msg
 
-    if pool.pool_max_size > 0:
+      if pool.pool_max_size > 0:
         total = pool_controller.get_pool_total_capacity(pool.get_id())
         if total + size > pool.pool_max_size:
             msg =  f"Invalid LVol size: {utils.humanbytes(size)}. pool max size has reached {utils.humanbytes(total+size)} of {utils.humanbytes(pool.pool_max_size)}"
             logger.error(msg)
             return False, msg
 
-    if pool.pool_max_size > 0:
+      if pool.pool_max_size > 0:
         total = pool_controller.get_pool_total_capacity(pool.get_id())
         if total + lvol.size > pool.pool_max_size:
             msg = f"Pool max size has reached {utils.humanbytes(total)} of {utils.humanbytes(pool.pool_max_size)}"
             logger.error(msg)
             return False, msg
 
-    cluster = db_controller.get_cluster_by_id(pool.cluster_id)
-    if cluster.status not in [cluster.STATUS_ACTIVE, cluster.STATUS_DEGRADED]:
+      cluster = db_controller.get_cluster_by_id(pool.cluster_id)
+      if cluster.status not in [cluster.STATUS_ACTIVE, cluster.STATUS_DEGRADED]:
         return False, f"Cluster is not active, status: {cluster.status}"
 
-    snap_vuid = utils.get_random_snapshot_vuid()
-    snap_bdev_name = f"SNAP_{snap_vuid}"
-    size = lvol.size
-    blobid = 0
-    snap_uuid = ""
-    used_size = 0
-    node_id = lvol.node_id
+      snap_vuid = utils.get_random_snapshot_vuid()
+      snap_bdev_name = f"SNAP_{snap_vuid}"
+      size = lvol.size
+      blobid = 0
+      snap_uuid = ""
+      used_size = 0
+      node_id = lvol.node_id
+
 
-    if lvol.ha_type == "single":
+
+      if lvol.ha_type == "single":
         if snode.status == StorageNode.STATUS_ONLINE:
             rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
             logger.info("Creating Snapshot bdev")
+
             ret = rpc_client.lvol_create_snapshot(f"{lvol.lvs_name}/{lvol.lvol_bdev}", snap_bdev_name)
             if not ret:
                 return False, f"Failed to create snapshot on node: {snode.get_id()}"
@@ -130,7 +134,7 @@ def add(lvol_id, snapshot_name):
             logger.error(msg)
             return False, msg
 
-    if lvol.ha_type == "ha":
+      if lvol.ha_type == "ha":
         primary_node = None
         secondary_node = None
         host_node = db_controller.get_storage_node_by_id(snode.get_id())
@@ -210,41 +214,72 @@ def add(lvol_id, snapshot_name):
                     logger.error(f"Failed to delete snap from node: {snode.get_id()}")
                 return False, msg
 
-    snap = SnapShot()
-    snap.uuid = str(uuid.uuid4())
-    snap.snap_uuid = snap_uuid
-    snap.size = size
-    snap.used_size = used_size
-    snap.blobid = blobid
-    snap.pool_uuid = pool.get_id()
-    snap.cluster_id = pool.cluster_id
-    snap.snap_name = snapshot_name
-    snap.snap_bdev = f"{lvol.lvs_name}/{snap_bdev_name}"
-    snap.created_at = int(time.time())
-    snap.lvol = lvol
-    snap.fabric = lvol.fabric
-    snap.vuid = snap_vuid
-    snap.node_id = node_id
-    snap.status = SnapShot.STATUS_ONLINE
-
-    snap.write_to_db(db_controller.kv_store)
-
-    if lvol.cloned_from_snap:
+      snap = SnapShot()
+      snap.uuid = str(uuid.uuid4())
+      snap.snap_uuid = snap_uuid
+      snap.size = size
+      snap.used_size = used_size
+      snap.blobid = blobid
+      snap.pool_uuid = pool.get_id()
+      snap.cluster_id = pool.cluster_id
+      snap.snap_name = snapshot_name
+      snap.snap_bdev = f"{lvol.lvs_name}/{snap_bdev_name}"
+      snap.created_at = int(time.time())
+      snap.lvol = lvol
+      snap.fabric = lvol.fabric
+      snap.vuid = snap_vuid
+      snap.node_id = node_id
+      snap.status = SnapShot.STATUS_ONLINE
+      snap.predecessor = lvol.last_snapshot_uuid
+      snap.successor.append(SnapshotRef.TYPE_LVOL,lvol.uuid)
+      tr = db_controller.kv_store.create_transaction()
+      snap.write_to_db(db_controller.kv_store,tr)
+      pred = db_controller.get_snapshot_by_id(lvol.last_snapshot_uuid)
+      for p in pred.successor:
+        if p.next == lvol.uuid:
+            p.next = snap.uuid
+
+      pred.write_to_db(db_controller.kv_store,tr)
+
+      lvol.last_snapshot_uuid = snap.uuid
+      lvol.write_to_db(db_controller.kv_store,tr)
+
+      if lvol.cloned_from_snap:
         original_snap = db_controller.get_snapshot_by_id(lvol.cloned_from_snap)
         if original_snap:
             if original_snap.snap_ref_id:
                 original_snap = db_controller.get_snapshot_by_id(original_snap.snap_ref_id)
 
             original_snap.ref_count += 1
-            original_snap.write_to_db(db_controller.kv_store)
+            original_snap.write_to_db(db_controller.kv_store,tr)
             snap.snap_ref_id = original_snap.get_id()
-            snap.write_to_db(db_controller.kv_store)
+            snap.write_to_db(db_controller.kv_store,tr)
+
+      tr.commit().wait()
+
+    #still should move to asynchronous delete just in case, if fdb fails AND
+    #at the same time primary goes down we could have inconsistency
+    #also, if the webapi container dies exactly during execution, this can be a problem
+
+    except Exception as e:
+       try:
+         rpc_client = RPCClient(
+           primary_node.mgmt_ip, primary_node.rpc_port, primary_node.rpc_username, primary_node.rpc_password)
+         rpc_client.delete_lvol(f"{lvol.lvs_name}/{snap_bdev_name}")
+       except:
+           raise logger.error(f"exception creating creating snapshot: {snap.uuid}")
+       try:
+         sec_rpc_client = RPCClient(
+           secondary_node.mgmt_ip, secondary_node.rpc_port, secondary_node.rpc_username, secondary_node.rpc_password)
+         sec_rpc_client.delete_lvol(f"{lvol.lvs_name}/{snap_bdev_name}")
+       except:
+           raise logger.error(f"exception creating creating snapshot: {snap.uuid}")
+       raise logger.error(f"exception creating creating snapshot: {snap.uuid}")
 
     logger.info("Done")
     snapshot_events.snapshot_create(snap)
     return snap.uuid, False
 
-
 def list(all=False):
     snaps = db_controller.get_snapshots()
     data = []
@@ -383,16 +418,21 @@ def delete(snapshot_uuid, force_delete=False):
 
 
 def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None):
-    try:
+
+
+
+  try:
         snap = db_controller.get_snapshot_by_id(snapshot_id)
-    except KeyError as e:
+  except KeyError as e:
         logger.error(e)
         return False, str(e)
 
-    if snap.frozen:
+  if snap.frozen:
         logger.warning(f"lvol in migration. cannot create clone {snap.uuid}")
         return False
 
+  node_lock = get_node_lock(snap.lvol.node_id)
+  with node_lock:
     try:
         pool = db_controller.get_pool_by_id(snap.lvol.pool_uuid)
     except KeyError:
@@ -406,90 +446,97 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None
         return False, msg
 
     try:
-        snode = db_controller.get_storage_node_by_id(snap.lvol.node_id)
+         snode = db_controller.get_storage_node_by_id(snap.lvol.node_id)
     except KeyError:
         msg = 'Storage node not found'
         logger.exception(msg)
         return False, msg
 
-    cluster = db_controller.get_cluster_by_id(pool.cluster_id)
-    if cluster.status not in [cluster.STATUS_ACTIVE, cluster.STATUS_DEGRADED]:
+    try:
+      cluster = db_controller.get_cluster_by_id(pool.cluster_id)
+      if cluster.status not in [cluster.STATUS_ACTIVE, cluster.STATUS_DEGRADED]:
         return False, f"Cluster is not active, status: {cluster.status}"
 
-    ref_count = snap.ref_count
-    if snap.snap_ref_id:
+      ref_count = snap.ref_count
+      if snap.snap_ref_id:
         ref_snap = db_controller.get_snapshot_by_id(snap.snap_ref_id)
         ref_count = ref_snap.ref_count
 
-    if ref_count >= constants.MAX_SNAP_COUNT:
+      if ref_count >= constants.MAX_SNAP_COUNT:
         msg = f"Can not create more than {constants.MAX_SNAP_COUNT} clones from this snapshot"
         logger.error(msg)
         return False, msg
 
-    for lvol in db_controller.get_lvols():
+      for lvol in db_controller.get_lvols():
         if lvol.pool_uuid == pool.get_id():
             if lvol.lvol_name == clone_name:
                 msg=f"LVol name must be unique: {clone_name}"
                 logger.error(msg)
                 return False, msg
 
-    size = snap.size
-    if 0 < pool.lvol_max_size < size:
+      size = snap.size
+      if 0 < pool.lvol_max_size < size:
         msg = f"Pool Max LVol size is: {utils.humanbytes(pool.lvol_max_size)}, LVol size: {utils.humanbytes(size)} must be below this limit"
         logger.error(msg)
         return False, msg
 
-    if pool.pool_max_size > 0:
+      if pool.pool_max_size > 0:
         total = pool_controller.get_pool_total_capacity(pool.get_id())
         if total + size > pool.pool_max_size:
             msg =  f"Invalid LVol size: {utils.humanbytes(size)}. Pool max size has reached {utils.humanbytes(total+size)} of {utils.humanbytes(pool.pool_max_size)}"
             logger.error(msg)
             return False, msg
 
-    lvol_count = len(db_controller.get_lvols_by_node_id(snode.get_id()))
-    if lvol_count >= snode.max_lvol:
+      lvol_count = len(db_controller.get_lvols_by_node_id(snode.get_id()))
+      if lvol_count >= snode.max_lvol:
         error = f"Too many lvols on node: {snode.get_id()}, max lvols reached: {lvol_count}"
         logger.error(error)
         return False, error
 
-    if pool.pool_max_size > 0:
+      if pool.pool_max_size > 0:
         total = pool_controller.get_pool_total_capacity(pool.get_id())
         if total + snap.lvol.size > pool.pool_max_size:
             msg = f"Pool max size has reached {utils.humanbytes(total)} of {utils.humanbytes(pool.pool_max_size)}"
             logger.error(msg)
             return False, msg
 
-    lvol = LVol()
-    lvol.uuid = str(uuid.uuid4())
-    lvol.lvol_name = clone_name
-    lvol.size = snap.lvol.size
-    lvol.max_size = snap.lvol.max_size
-    lvol.base_bdev = snap.lvol.base_bdev
-    lvol.lvol_bdev = f"CLN_{utils.get_random_vuid()}"
-    lvol.lvs_name = snap.lvol.lvs_name
-    lvol.top_bdev = f"{lvol.lvs_name}/{lvol.lvol_bdev}"
-    lvol.hostname = snode.hostname
-    lvol.node_id = snode.get_id()
-    lvol.nodes = snap.lvol.nodes
-    lvol.mode = 'read-write'
-    lvol.cloned_from_snap = snapshot_id
-    lvol.nqn = cluster.nqn + ":lvol:" + lvol.uuid
-    lvol.pool_uuid = pool.get_id()
-    lvol.ha_type = snap.lvol.ha_type
-    lvol.lvol_type = 'lvol'
-    lvol.guid = utils.generate_hex_string(16)
-    lvol.vuid = snap.lvol.vuid
-    lvol.snapshot_name = snap.snap_bdev
-    lvol.subsys_port = snap.lvol.subsys_port
-    lvol.fabric = snap.fabric
-
-    if pvc_name:
+      lvol = LVol()
+      lvol.uuid = str(uuid.uuid4())
+      lvol.lvol_name = clone_name
+      lvol.size = snap.lvol.size
+      lvol.max_size = snap.lvol.max_size
+      lvol.base_bdev = snap.lvol.base_bdev
+      lvol.lvol_bdev = f"CLN_{utils.get_random_vuid()}"
+      lvol.lvs_name = snap.lvol.lvs_name
+      lvol.top_bdev = f"{lvol.lvs_name}/{lvol.lvol_bdev}"
+      lvol.hostname = snode.hostname
+      lvol.node_id = snode.get_id()
+      lvol.nodes = snap.lvol.nodes
+      lvol.mode = 'read-write'
+      lvol.cloned_from_snap = snapshot_id
+      lvol.nqn = cluster.nqn + ":lvol:" + lvol.uuid
+      lvol.pool_uuid = pool.get_id()
+      lvol.ha_type = snap.lvol.ha_type
+      lvol.lvol_type = 'lvol'
+      lvol.guid = utils.generate_hex_string(16)
+      lvol.vuid = snap.lvol.vuid
+      lvol.snapshot_name = snap.snap_bdev
+      lvol.subsys_port = snap.lvol.subsys_port
+      lvol.fabric = snap.fabric
+      lvol.last_snapshot_uuid=snap.snap_uuid
+
+      su = SnapshotRef()
+      su.type = SnapshotRef.TYPE_CLONE
+      su.next = lvol.uuid
+      snap.successor.append(su)
+
+      if pvc_name:
         lvol.pvc_name = pvc_name
-    if pvc_namespace:
+      if pvc_namespace:
         lvol.namespace = pvc_namespace
 
-    lvol.status = LVol.STATUS_IN_CREATION
-    lvol.bdev_stack = [
+      lvol.status = LVol.STATUS_IN_CREATION
+      lvol.bdev_stack = [
         {
             "type": "bdev_lvol_clone",
             "name": lvol.top_bdev,
@@ -498,9 +545,9 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None
                 "clone_name": lvol.lvol_bdev
             }
         }
-    ]
+      ]
 
-    if snap.lvol.crypto_bdev:
+      if snap.lvol.crypto_bdev:
         lvol.crypto_bdev = f"crypto_{lvol.lvol_bdev}"
         lvol.bdev_stack.append({
             "type": "crypto",
@@ -517,7 +564,7 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None
         lvol.crypto_key1 = snap.lvol.crypto_key1
         lvol.crypto_key2 = snap.lvol.crypto_key2
 
-    if new_size:
+      if new_size:
         if snap.lvol.size >= new_size:
             msg = f"New size {new_size} must be higher than the original size {snap.lvol.size}"
             logger.error(msg)
@@ -529,17 +576,21 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None
             return False, msg
         lvol.size = new_size
 
-    lvol.write_to_db(db_controller.kv_store)
+      tr = db_controller.kv_store.create_transaction()
+      lvol.write_to_db(db_controller.kv_store,tr)
+      snap.write_to_db(db_controller.kv_store,tr)
 
-    if lvol.ha_type == "single":
+      if lvol.ha_type == "single":
         lvol_bdev, error = lvol_controller.add_lvol_on_node(lvol, snode)
         if error:
+            msg = f"Could not add lvol on node {lvol.uuid}"
+            logger.error(msg)
             return False, error
         lvol.nodes = [snode.get_id()]
         lvol.lvol_uuid = lvol_bdev['uuid']
         lvol.blobid = lvol_bdev['driver_specific']['lvol']['blobid']
 
-    if lvol.ha_type == "ha":
+      if lvol.ha_type == "ha":
         host_node = snode
         lvol.nodes = [host_node.get_id(), host_node.secondary_node_id]
         primary_node = None
@@ -552,7 +603,6 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None
                 if sec_node.status == StorageNode.STATUS_DOWN:
                     msg = "Secondary node is in down status, can not clone snapshot"
                     logger.error(msg)
-                    lvol.remove(db_controller.kv_store)
                     return False, msg
 
                 if sec_node.status == StorageNode.STATUS_ONLINE:
@@ -588,8 +638,8 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None
         if primary_node:
             lvol_bdev, error = lvol_controller.add_lvol_on_node(lvol, primary_node)
             if error:
-                logger.error(error)
-                lvol.remove(db_controller.kv_store)
+                msg = f"Cannot add lvol to node: {lvol.uuid}"
+                logger.error(msg)
                 return False, error
 
             lvol.lvol_uuid = lvol_bdev['uuid']
@@ -598,21 +648,25 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None
         if secondary_node:
             lvol_bdev, error = lvol_controller.add_lvol_on_node(lvol, secondary_node, is_primary=False)
             if error:
+                msg = f"Cannot add lvol to secondary node: {lvol.uuid}"
                 logger.error(error)
-                lvol.remove(db_controller.kv_store)
                 return False, error
 
-    lvol.status = LVol.STATUS_ONLINE
-    lvol.write_to_db(db_controller.kv_store)
+      lvol.status = LVol.STATUS_ONLINE
+      lvol.write_to_db(db_controller.kv_store)
 
-    if snap.snap_ref_id:
+      if snap.snap_ref_id:
         ref_snap = db_controller.get_snapshot_by_id(snap.snap_ref_id)
         ref_snap.ref_count += 1
         ref_snap.write_to_db(db_controller.kv_store)
-    else:
+      else:
         snap.ref_count += 1
         snap.write_to_db(db_controller.kv_store)
 
+      tr.commit.wait()
+    except:
+       raise f"could not create clone: {lvol.uuid}"
+
     logger.info("Done")
     snapshot_events.snapshot_clone(snap, lvol)
     if new_size:
diff --git a/simplyblock_core/models/base_model.py b/simplyblock_core/models/base_model.py
index 23640c816..b5c9b171d 100644
--- a/simplyblock_core/models/base_model.py
+++ b/simplyblock_core/models/base_model.py
@@ -143,18 +143,36 @@ def get_last(self, kv_store):
             return objects[0]
         return None
 
-    def write_to_db(self, kv_store=None):
-        if not kv_store:
+    def write_to_db(self, kv_store=None, transaction=None):
+        """
+        Write this object to FDB. Can either:
+        - use an existing transaction (transaction parameter), or
+        - use kv_store for a standalone write.
+
+        Note: If you pass a transaction, commit must be done outside this function.
+        """
+        if not kv_store and not transaction:
             from simplyblock_core.db_controller import DBController
             kv_store = DBController().kv_store
+
         try:
             prefix = self.get_db_id()
             st = json.dumps(self.to_dict())
-            kv_store.set(prefix.encode(), st.encode())
+
+            if transaction:
+                # Use the existing transaction, don't commit here
+                transaction.set(prefix.encode(), st.encode())
+            else:
+                # Create a new transaction for standalone write
+                tr = kv_store.create_transaction()
+                tr.set(prefix.encode(), st.encode())
+                tr.commit().wait()
+
             return True
+
         except Exception as e:
             print(f"Error Writing to FDB! {e}")
-            exit(1)
+            raise  # Better than exit, let caller handle
 
     def remove(self, kv_store):
         prefix = self.get_db_id()
diff --git a/simplyblock_core/models/lvol_migration.py b/simplyblock_core/models/lvol_migration.py
index e4dc477bc..01b18fc74 100644
--- a/simplyblock_core/models/lvol_migration.py
+++ b/simplyblock_core/models/lvol_migration.py
@@ -1,11 +1,10 @@
 from __future__ import annotations
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from enum import Enum
-from typing import List, Optional
-import uuid
-import asyncio
+from typing import List
 import storage_node
 from base_model import *
+from simplyblock_core.rpc_client import RPCClient
 
 
 # ---------------------------------------------------------------------------
@@ -13,34 +12,34 @@
 # ---------------------------------------------------------------------------
 
 class MigrationState(str, Enum):
-    NEW = "new"
-    PREPARING = "preparing"
-    RUNNING = "running"
-    SNAPS_MIGRATED = "migrated"
-    TARGET_LVOL_CREATED = "target_lvol_created"
-    HUBLVOL_CONNECTED = "hublvol_connecte"
-    TRANSFERRED_TO_TARGET = "transferred_to_target"
-    RECONNECT_DONE = "reconnect_done"
-    CLEANUP = "cleanup"
-    FAILED = "failed"
-    FAILED_AND_CLEANED = "failed_and_cleaned"
-    DONE = "done"
+    NEW = 0
+    PREPARING = 1
+    RUNNING = 2
+    SNAPS_MIGRATED = 3
+    TARGET_LVOL_CREATED = 4
+    HUBLVOL_CONNECTED = 5
+    TRANSFERRED_TO_TARGET = 6
+    RECONNECT_DONE = 6
+    CLEANUP = 7
+    FAILED = 8
+    FAILED_AND_CLEANED = 9
+    DONE = 10
 
 class ObjectMigrationState(str, Enum):
-    NEW = "new"
-    LVOL_CREATED = "lvolcreated"
-    MIG_FLAG_SET = "migflagset"
-    NAMESPACE_CREATED = "nscreated"
-    NQN_CREATED = "nqncreated"
-    LVOL_CONNECTED = "lvolconnected"
-    LVOL_EXPORTED = "lvol_exported"
-    TRANSFER = "transferring"
-    RETRANSFER = "retransfer"
-    TRANSFERRED = "transferred"
-    CONVERTED = "converted"
-    CLEANING = "cleaning"
-    FAILED = "failed"
-    DONE = "done"
+    NEW = 1
+    LVOL_CREATED = 2
+    MIG_FLAG_SET = 3
+    NAMESPACE_CREATED = 4
+    NQN_CREATED = 5
+    LVOL_CONNECTED = 6
+    LVOL_EXPORTED = 7
+    TRANSFER = 8
+    RETRANSFER = 9
+    TRANSFERRED = 10
+    CONVERTED = 11
+    CLEANING = 12
+    FAILED = 13
+    DONE = 14
 
 # ---------------------------------------------------------------------------
 # DATA MODELS
@@ -101,6 +100,10 @@ class MigrationObject(BaseModel):
     node_sec: storage_node.StorageNode = None
     target_node_pri: storage_node.StorageNode = None
     target_node_sec: storage_node.StorageNode = None
+    rpc_client1: RPCClient = None
+    rpc_client2: RPCClient = None
+    rpc_client3: RPCClient = None
+    rpc_client4: RPCClient = None
 
     # Global snapshot objects (shared across streams)
     snapshots: List[Snapshot] = None
diff --git a/simplyblock_core/models/lvol_model.py b/simplyblock_core/models/lvol_model.py
index fd0ca3356..baf69477c 100644
--- a/simplyblock_core/models/lvol_model.py
+++ b/simplyblock_core/models/lvol_model.py
@@ -67,6 +67,7 @@ class LVol(BaseModel):
     ndcs: int = 0
     npcs: int = 0
     frozen: bool = False
+    last_snapshot_uuid: str = ""
 
     def has_qos(self):
         return (self.rw_ios_per_sec > 0 or self.rw_mbytes_per_sec > 0 or self.r_mbytes_per_sec > 0 or self.w_mbytes_per_sec > 0)
diff --git a/simplyblock_core/models/snapshot.py b/simplyblock_core/models/snapshot.py
index fdbdde8ea..0e76b56dd 100644
--- a/simplyblock_core/models/snapshot.py
+++ b/simplyblock_core/models/snapshot.py
@@ -2,10 +2,21 @@
 
 from simplyblock_core.models.base_model import BaseModel
 from simplyblock_core.models.lvol_model import LVol
+from typing import List
+
+
+class SnapshotRef():
+
+    TYPE_LVOL = "lvol"
+    TYPE_CLONE = "clone"
+    TYPE_SNAP = "snap"
+    type: str
+    next: str
 
 
 class SnapShot(BaseModel):
 
+
     STATUS_ONLINE = 'online'
     STATUS_OFFLINE = 'offline'
     STATUS_IN_DELETION = 'in_deletion'
@@ -31,3 +42,5 @@ class SnapShot(BaseModel):
     fabric: str = "tcp"
     frozen: bool = False
     node_id : str = ""
+    successor : List[SnapshotRef] = []
+    predecessor: str = ""

From f412121e57aba9a4f03a49a2c222fe0199c6ec11 Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Tue, 2 Dec 2025 00:36:28 +0300
Subject: [PATCH 31/68] Create partitions and alcemls on node add in parallel
 (#763) (#785)

* Create partitions and alcemls on node add in parallel

* fix 1

* connect to remote alcemls in parallel

* Create distrib bdevs in parallel

* Create distrib bdevs in parallel

* prepare for merge

* Fix sfam-2485
---
 simplyblock_core/storage_node_ops.py | 185 +++++++++++++++------------
 1 file changed, 102 insertions(+), 83 deletions(-)

diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py
index 162f0dd1a..719284ab4 100644
--- a/simplyblock_core/storage_node_ops.py
+++ b/simplyblock_core/storage_node_ops.py
@@ -413,8 +413,8 @@ def _create_storage_device_stack(rpc_client, nvme, snode, after_restart):
     return nvme
 
 
-def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, jm_percent, partition_size=0):
-    nbd_device = rpc_client.nbd_start_disk(nvme.nvme_bdev)
+def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, jm_percent, partition_size, nbd_index):
+    nbd_device = rpc_client.nbd_start_disk(nvme.nvme_bdev, f"/dev/nbd{nbd_index}")
     time.sleep(3)
     if not nbd_device:
         logger.error("Failed to start nbd dev")
@@ -447,79 +447,84 @@ def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, j
 
 def _prepare_cluster_devices_partitions(snode, devices):
     db_controller = DBController()
-    rpc_client = RPCClient(
-        snode.mgmt_ip, snode.rpc_port,
-        snode.rpc_username, snode.rpc_password)
-
     new_devices = []
-    jm_devices = []
-    dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
-    bdevs_names = [d['name'] for d in rpc_client.get_bdevs()]
+    devices_to_partition = []
+    thread_list = []
     for index, nvme in enumerate(devices):
         if nvme.status == "not_found":
             continue
-
         if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_NEW]:
             logger.debug(f"Device is skipped: {nvme.get_id()}, status: {nvme.status}")
             new_devices.append(nvme)
             continue
-
         if nvme.is_partition:
-            dev_part = f"{nvme.nvme_bdev[:-2]}p1"
-            if dev_part in bdevs_names:
-                if dev_part not in jm_devices:
-                    jm_devices.append(dev_part)
-
-            new_device = _create_storage_device_stack(rpc_client, nvme, snode, after_restart=False)
-            if not new_device:
-                logger.error("failed to create dev stack")
-                return False
-            new_devices.append(new_device)
-            if new_device.status == NVMeDevice.STATUS_ONLINE:
-                device_events.device_create(new_device)
-
+            t = threading.Thread(target=_create_storage_device_stack, args=(snode.rpc_client(), nvme, snode, False,))
+            thread_list.append(t)
+            new_devices.append(nvme)
+            t.start()
         else:
-            # look for partitions
-            partitioned_devices = _search_for_partitions(rpc_client, nvme)
-            logger.debug("partitioned_devices")
-            logger.debug(partitioned_devices)
-            if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
-                logger.info("Partitioned devices found")
-            else:
+            devices_to_partition.append(nvme)
+            partitioned_devices = _search_for_partitions(snode.rpc_client(), nvme)
+            if len(partitioned_devices) != (1 + snode.num_partitions_per_dev):
                 logger.info(f"Creating partitions for {nvme.nvme_bdev}")
-                _create_device_partitions(rpc_client, nvme, snode, snode.num_partitions_per_dev, snode.jm_percent,
-                                          snode.partition_size)
-                partitioned_devices = _search_for_partitions(rpc_client, nvme)
-                if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
-                    logger.info("Device partitions created")
-                else:
-                    logger.error("Failed to create partitions")
-                    return False
+                t = threading.Thread(
+                    target=_create_device_partitions,
+                    args=(snode.rpc_client(), nvme, snode, snode.num_partitions_per_dev,
+                          snode.jm_percent, snode.partition_size, index+1,))
+                thread_list.append(t)
+                t.start()
 
-            jm_devices.append(partitioned_devices.pop(0).nvme_bdev)
+    for thread in thread_list:
+        thread.join()
 
+    thread_list = []
+    for nvme in devices_to_partition:
+        partitioned_devices = _search_for_partitions(snode.rpc_client(), nvme)
+        if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
+            logger.info("Device partitions created")
+            # remove 1st partition for jm
+            partitioned_devices.pop(0)
             for dev in partitioned_devices:
-                ret = _create_storage_device_stack(rpc_client, dev, snode, after_restart=False)
-                if not ret:
-                    logger.error("failed to create dev stack")
-                    return False
-                if dev.status == NVMeDevice.STATUS_ONLINE:
-                    if dev.cluster_device_order < 0:
-                        dev.cluster_device_order = dev_order
-                        dev_order += 1
-                    device_events.device_create(dev)
+                t = threading.Thread(target=_create_storage_device_stack,
+                                     args=(snode.rpc_client(), dev, snode, False,))
+                thread_list.append(t)
                 new_devices.append(dev)
+                t.start()
+        else:
+            logger.error("Failed to create partitions")
+            return False
 
-    snode.nvme_devices = new_devices
+    for thread in thread_list:
+        thread.join()
+
+    # assign device order
+    dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    for nvme in new_devices:
+        if nvme.status == NVMeDevice.STATUS_ONLINE:
+            if nvme.cluster_device_order < 0:
+                nvme.cluster_device_order = dev_order
+                dev_order += 1
+        device_events.device_create(nvme)
+
+    # create jm device
+    jm_devices = []
+    bdevs_names = [d['name'] for d in snode.rpc_client().get_bdevs()]
+    for nvme in new_devices:
+        if nvme.status == NVMeDevice.STATUS_ONLINE:
+            dev_part = f"{nvme.nvme_bdev[:-2]}p1"
+            if dev_part in bdevs_names:
+                if dev_part not in jm_devices:
+                    jm_devices.append(dev_part)
 
     if jm_devices:
-        jm_device = _create_jm_stack_on_raid(rpc_client, jm_devices, snode, after_restart=False)
+        jm_device = _create_jm_stack_on_raid(snode.rpc_client(), jm_devices, snode, after_restart=False)
         if not jm_device:
             logger.error("Failed to create JM device")
             return False
 
         snode.jm_device = jm_device
 
+    snode.nvme_devices = new_devices
     return True
 
 
@@ -701,6 +706,8 @@ def _connect_to_remote_devs(
         allowed_node_statuses.append(StorageNode.STATUS_RESTARTING)
         allowed_dev_statuses.append(NVMeDevice.STATUS_UNAVAILABLE)
 
+    devices_to_connect = []
+    connect_threads = []
     nodes = db_controller.get_storage_nodes_by_cluster_id(this_node.cluster_id)
     # connect to remote devs
     for node_index, node in enumerate(nodes):
@@ -715,12 +722,29 @@ def _connect_to_remote_devs(
 
             if not dev.alceml_bdev:
                 raise ValueError(f"device alceml bdev not found!, {dev.get_id()}")
+            devices_to_connect.append(dev)
+            t = threading.Thread(
+                target=connect_device,
+                args=(f"remote_{dev.alceml_bdev}", dev, this_node, node_bdev_names, reattach,))
+            connect_threads.append(t)
+            t.start()
 
-            dev.remote_bdev = connect_device(
-                    f"remote_{dev.alceml_bdev}", dev, this_node,
-                    bdev_names=node_bdev_names, reattach=reattach,
-            )
-            remote_devices.append(dev)
+    for t in connect_threads:
+        t.join()
+
+    node_bdevs = rpc_client.get_bdevs()
+    if node_bdevs:
+        node_bdev_names = [b['name'] for b in node_bdevs]
+
+    for dev in devices_to_connect:
+        for bdev in node_bdev_names:
+            if bdev.startswith(f"remote_{dev.alceml_bdev}"):
+                dev.remote_bdev = bdev
+                break
+        if not dev.remote_bdev:
+            logger.error(f"Failed to connect to remote device {dev.alceml_name}")
+            continue
+        remote_devices.append(dev)
 
     return remote_devices
 
@@ -1980,7 +2004,7 @@ def restart_storage_node(
                     logger.error('Failed to connect to remote devices')
                     return False
                 node.write_to_db(kv_store)
-                    
+
 
             logger.info("Sending device status event")
             snode = db_controller.get_storage_node_by_id(snode.get_id())
@@ -2137,21 +2161,6 @@ def list_storage_devices(node_id, is_json):
             "Health": snode.jm_device.health_check
         })
 
-    for jm_id in snode.jm_ids:
-        try:
-            jm_device = db_controller.get_jm_device_by_id(jm_id)
-        except KeyError:
-            continue
-
-        jm_devices.append({
-            "UUID": jm_device.uuid,
-            "Name": jm_device.device_name,
-            "Size": utils.humanbytes(jm_device.size),
-            "Status": jm_device.status,
-            "IO Err": jm_device.io_error,
-            "Health": jm_device.health_check
-        })
-
     for device in snode.remote_devices:
         logger.debug(device)
         logger.debug("*" * 20)
@@ -3604,6 +3613,15 @@ def create_lvstore(snode, ndcs, npcs, distr_bs, distr_chunk_bs, page_size_in_blo
 
 
 def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None):
+    def _create_distr(snode, name, params):
+        try:
+            rpc_client.bdev_distrib_create(**params)
+        except Exception:
+            logger.error("Failed to create bdev distrib")
+        ret = distr_controller.send_cluster_map_to_distr(snode, name)
+        if not ret:
+            logger.error("Failed to send cluster map")
+
     rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
     db_controller = DBController()
     cluster = db_controller.get_cluster_by_id(snode.cluster_id)
@@ -3620,11 +3638,11 @@ def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None):
     else:
         node_bdev_names = []
 
+    thread_list = []
     for bdev in stack:
         type = bdev['type']
         name = bdev['name']
         params = bdev['params']
-
         if name in node_bdev_names:
             continue
 
@@ -3640,23 +3658,21 @@ def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None):
                 snode.distrib_cpu_index = (snode.distrib_cpu_index + 1) % len(snode.distrib_cpu_cores)
 
             params['full_page_unmap'] = cluster.full_page_unmap
-            ret = rpc_client.bdev_distrib_create(**params)
-            if ret:
-                ret = distr_controller.send_cluster_map_to_distr(snode, name)
-                if not ret:
-                    return False, "Failed to send cluster map"
-                # time.sleep(1)
+            t = threading.Thread(target=_create_distr, args=(snode, name, params,))
+            thread_list.append(t)
+            t.start()
+            ret = True
 
         elif type == "bdev_lvstore" and lvstore_stack and not primary_node:
-            ret = rpc_client.create_lvstore(**params)
-            # if ret and snode.jm_vuid > 0:
-            #     rpc_client.bdev_lvol_set_lvs_ops(snode.lvstore, snode.jm_vuid, snode.lvol_subsys_port)
+                ret = rpc_client.create_lvstore(**params)
 
         elif type == "bdev_ptnonexcl":
             ret = rpc_client.bdev_PT_NoExcl_create(**params)
 
         elif type == "bdev_raid":
-
+            if thread_list:
+                for t in thread_list:
+                    t.join()
             distribs_list = bdev["distribs_list"]
             strip_size_kb = params["strip_size_kb"]
             ret = rpc_client.bdev_raid_create(name, distribs_list, strip_size_kb=strip_size_kb)
@@ -3674,6 +3690,9 @@ def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None):
                 _remove_bdev_stack(created_bdevs[::-1], rpc_client)
             return False, f"Failed to create BDev: {name}"
 
+    if thread_list:
+        for t in thread_list:
+            t.join()
     return True, None
 
 

From 3c60a2c0f5561680047db0f28aa0b0b09d758409 Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Tue, 2 Dec 2025 00:36:40 +0300
Subject: [PATCH 32/68] Remove stats from fdb and get it from Prometheus (#762)
 (#786)

* expose prometheus port 9090 using HAProxy

* wip 3

* wip 4

* wip 5

* wip 6

* keep only 10 records for device,node,cluster stats in fdb

* wip7

* update env_var image

* fix 1

* remove connected_clients from lvol stats

* Fix pool api

* Fix linter issues and type checker

* prepare for merge

* Fix history param parser

* fix prom get_metrics param end_time
---
 requirements.txt                              |   1 +
 simplyblock_core/cluster_ops.py               |  36 +++--
 .../controllers/device_controller.py          |  18 +--
 .../controllers/lvol_controller.py            |  27 +---
 .../controllers/pool_controller.py            |  20 +--
 simplyblock_core/prom_client.py               | 126 ++++++++++++++++++
 .../services/capacity_and_stats_collector.py  |  15 +++
 .../services/lvol_stat_collector.py           |  11 ++
 simplyblock_core/storage_node_ops.py          |  27 +---
 simplyblock_web/api/v1/pool.py                |  35 ++---
 simplyblock_web/api/v2/pool.py                |   4 +-
 11 files changed, 211 insertions(+), 109 deletions(-)
 create mode 100644 simplyblock_core/prom_client.py

diff --git a/requirements.txt b/requirements.txt
index 030cca8e0..9ee458f00 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,3 +24,4 @@ flask-openapi3
 jsonschema
 fastapi
 uvicorn
+prometheus_api_client
\ No newline at end of file
diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py
index dc429b8f9..24be657d7 100644
--- a/simplyblock_core/cluster_ops.py
+++ b/simplyblock_core/cluster_ops.py
@@ -25,6 +25,7 @@
 from simplyblock_core.models.stats import LVolStatObject, ClusterStatObject, NodeStatObject, DeviceStatObject
 from simplyblock_core.models.nvme_device import NVMeDevice
 from simplyblock_core.models.storage_node import StorageNode
+from simplyblock_core.prom_client import PromClient
 from simplyblock_core.utils import pull_docker_image_with_retry
 
 logger = utils.get_logger(__name__)
@@ -1001,16 +1002,11 @@ def list_all_info(cluster_id) -> str:
 
 
 def get_capacity(cluster_id, history, records_count=20) -> t.List[dict]:
-    cluster = db_controller.get_cluster_by_id(cluster_id)
-
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            raise ValueError(f"Error parsing history string: {history}")
-    else:
-        records_number = 20
-
-    records = db_controller.get_cluster_capacity(cluster, records_number)
+    try:
+        _ = db_controller.get_cluster_by_id(cluster_id)
+    except KeyError:
+        logger.error(f"Cluster not found: {cluster_id}")
+        return []
 
     cap_stats_keys = [
         "date",
@@ -1021,20 +1017,17 @@ def get_capacity(cluster_id, history, records_count=20) -> t.List[dict]:
         "size_util",
         "size_prov_util",
     ]
+    prom_client = PromClient(cluster_id)
+    records = prom_client.get_cluster_metrics(cluster_id, cap_stats_keys, history)
     return utils.process_records(records, records_count, keys=cap_stats_keys)
 
 
 def get_iostats_history(cluster_id, history_string, records_count=20, with_sizes=False) -> t.List[dict]:
-    cluster = db_controller.get_cluster_by_id(cluster_id)
-
-    if history_string:
-        records_number = utils.parse_history_param(history_string)
-        if not records_number:
-            raise ValueError(f"Error parsing history string: {history_string}")
-    else:
-        records_number = 20
-
-    records = db_controller.get_cluster_stats(cluster, records_number)
+    try:
+        _ = db_controller.get_cluster_by_id(cluster_id)
+    except KeyError:
+        logger.error(f"Cluster not found: {cluster_id}")
+        return []
 
     io_stats_keys = [
         "date",
@@ -1072,6 +1065,9 @@ def get_iostats_history(cluster_id, history_string, records_count=20, with_sizes
                 "write_latency_ticks",
             ]
         )
+
+    prom_client = PromClient(cluster_id)
+    records = prom_client.get_cluster_metrics(cluster_id, io_stats_keys, history_string)
     # combine records
     return utils.process_records(records, records_count, keys=io_stats_keys)
 
diff --git a/simplyblock_core/controllers/device_controller.py b/simplyblock_core/controllers/device_controller.py
index 8e684c942..6f7a0d9f5 100644
--- a/simplyblock_core/controllers/device_controller.py
+++ b/simplyblock_core/controllers/device_controller.py
@@ -6,6 +6,7 @@
 from simplyblock_core.db_controller import DBController
 from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice
 from simplyblock_core.models.storage_node import StorageNode
+from simplyblock_core.prom_client import PromClient
 from simplyblock_core.rpc_client import RPCClient
 
 
@@ -440,7 +441,7 @@ def get_device_capacity(device_id, history, records_count=20, parse_sizes=True):
     else:
         records_number = 20
 
-    records = db_controller.get_device_capacity(device, records_number)
+    # records = db_controller.get_device_capacity(device, records_number)
     cap_stats_keys = [
         "date",
         "size_total",
@@ -448,6 +449,8 @@ def get_device_capacity(device_id, history, records_count=20, parse_sizes=True):
         "size_free",
         "size_util",
     ]
+    prom_client = PromClient(device.cluster_id)
+    records = prom_client.get_device_metrics(device_id, cap_stats_keys, history)
     records_list = utils.process_records(records, records_count, keys=cap_stats_keys)
 
     if not parse_sizes:
@@ -474,15 +477,6 @@ def get_device_iostats(device_id, history, records_count=20, parse_sizes=True):
         logger.error("device not found")
         return False
 
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
-
-    records_list = db_controller.get_device_stats(device, records_number)
     io_stats_keys = [
         "date",
         "read_bytes",
@@ -496,8 +490,10 @@ def get_device_iostats(device_id, history, records_count=20, parse_sizes=True):
         "write_io_ps",
         "write_latency_ps",
     ]
+    prom_client = PromClient(device.cluster_id)
+    records = prom_client.get_device_metrics(device_id, io_stats_keys, history)
     # combine records
-    new_records = utils.process_records(records_list, records_count, keys=io_stats_keys)
+    new_records = utils.process_records(records, records_count, keys=io_stats_keys)
 
     if not parse_sizes:
         return new_records
diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py
index 4d7a5aad3..be8c4fc55 100644
--- a/simplyblock_core/controllers/lvol_controller.py
+++ b/simplyblock_core/controllers/lvol_controller.py
@@ -15,6 +15,7 @@
 from simplyblock_core.models.pool import Pool
 from simplyblock_core.models.lvol_model import LVol
 from simplyblock_core.models.storage_node import StorageNode
+from simplyblock_core.prom_client import PromClient
 from simplyblock_core.rpc_client import RPCClient
 
 logger = lg.getLogger()
@@ -1521,19 +1522,11 @@ def get_capacity(lvol_uuid, history, records_count=20, parse_sizes=True):
     db_controller = DBController()
     try:
         lvol = db_controller.get_lvol_by_id(lvol_uuid)
+        pool = db_controller.get_pool_by_id(lvol.pool_uuid)
     except KeyError as e:
         logger.error(e)
         return False
 
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
-
-    records_list = db_controller.get_lvol_stats(lvol, limit=records_number)
     cap_stats_keys = [
         "date",
         "size_total",
@@ -1543,6 +1536,8 @@ def get_capacity(lvol_uuid, history, records_count=20, parse_sizes=True):
         "size_prov",
         "size_prov_util"
     ]
+    prom_client = PromClient(pool.cluster_id)
+    records_list = prom_client.get_lvol_metrics(lvol_uuid, cap_stats_keys, history)
     new_records = utils.process_records(records_list, records_count, keys=cap_stats_keys)
 
     if not parse_sizes:
@@ -1564,19 +1559,11 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si
     db_controller = DBController()
     try:
         lvol = db_controller.get_lvol_by_id(lvol_uuid)
+        pool = db_controller.get_pool_by_id(lvol.pool_uuid)
     except KeyError as e:
         logger.error(e)
         return False
 
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
-
-    records_list = db_controller.get_lvol_stats(lvol, limit=records_number)
     io_stats_keys = [
         "date",
         "read_bytes",
@@ -1587,7 +1574,6 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si
         "write_bytes_ps",
         "write_io_ps",
         "write_latency_ps",
-        "connected_clients",
     ]
     if with_sizes:
         io_stats_keys.extend(
@@ -1612,6 +1598,8 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si
                 "write_latency_ticks",
             ]
         )
+    prom_client = PromClient(pool.cluster_id)
+    records_list = prom_client.get_lvol_metrics(lvol_uuid, io_stats_keys, history)
     # combine records
     new_records = utils.process_records(records_list, records_count, keys=io_stats_keys)
 
@@ -1630,7 +1618,6 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si
             "Write speed": utils.humanbytes(record['write_bytes_ps']),
             "Write IOPS": record['write_io_ps'],
             "Write lat": record['write_latency_ps'],
-            "Con": record['connected_clients'],
         })
     return out
 
diff --git a/simplyblock_core/controllers/pool_controller.py b/simplyblock_core/controllers/pool_controller.py
index db7016d7d..2440a6bd7 100644
--- a/simplyblock_core/controllers/pool_controller.py
+++ b/simplyblock_core/controllers/pool_controller.py
@@ -12,6 +12,7 @@
 from simplyblock_core.controllers import pool_events, lvol_controller
 from simplyblock_core.db_controller import DBController
 from simplyblock_core.models.pool import Pool
+from simplyblock_core.prom_client import PromClient
 from simplyblock_core.rpc_client import RPCClient
 
 logger = lg.getLogger()
@@ -321,15 +322,18 @@ def get_io_stats(pool_id, history, records_count=20):
         logger.error(f"Pool not found {pool_id}")
         return False
 
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
+    io_stats_keys = [
+        "date",
+        "read_bytes_ps",
+        "read_io_ps",
+        "read_latency_ps",
+        "write_bytes_ps",
+        "write_io_ps",
+        "write_latency_ps",
+    ]
 
-    out = db_controller.get_pool_stats(pool, records_number)
+    prom_client = PromClient(pool.cluster_id)
+    out = prom_client.get_pool_metrics(pool_id, io_stats_keys, history)
     new_records = utils.process_records(out, records_count)
 
     return utils.print_table([
diff --git a/simplyblock_core/prom_client.py b/simplyblock_core/prom_client.py
new file mode 100644
index 000000000..82756161b
--- /dev/null
+++ b/simplyblock_core/prom_client.py
@@ -0,0 +1,126 @@
+import logging
+import re
+from datetime import datetime, timedelta
+
+from simplyblock_core.db_controller import DBController
+from simplyblock_core.models.mgmt_node import MgmtNode
+
+from prometheus_api_client import PrometheusConnect
+
+logger = logging.getLogger()
+
+
+class PromClientException(Exception):
+    def __init__(self, message):
+        self.message = message
+
+
+class PromClient:
+
+    def __init__(self, cluster_id):
+        db_controller = DBController()
+        cluster_ip = None
+        for node in db_controller.get_mgmt_nodes():
+            if node.cluster_id == cluster_id and node.status == MgmtNode.STATUS_ONLINE:
+                cluster_ip = node.mgmt_ip
+                break
+        if cluster_ip is None:
+            raise PromClientException("Cluster has no online mgmt nodes")
+
+        self.ip_address = f"{cluster_ip}:9090"
+        self.url = 'http://%s/' % self.ip_address
+        self.client = PrometheusConnect(url=self.url, disable_ssl=True)
+
+    def parse_history_param(self, history_string):
+        if not history_string:
+            logger.error("Invalid history value")
+            return False
+
+        # process history
+        results = re.search(r'^(\d+[hmd])(\d+[hmd])?$', history_string.lower())
+        if not results:
+            logger.error(f"Error parsing history string: {history_string}")
+            logger.info("History format: xxdyyh , e.g: 1d12h, 1d, 2h, 1m")
+            return False
+
+        history_in_days = 0
+        history_in_hours = 0
+        history_in_minutes = 0
+        for s in results.groups():
+            if not s:
+                continue
+            ind = s[-1]
+            v = int(s[:-1])
+            if ind == 'd':
+                history_in_days = v
+            if ind == 'h':
+                history_in_hours = v
+            if ind == 'm':
+                history_in_minutes = v
+
+        history_in_hours += int(history_in_minutes/60)
+        history_in_minutes = history_in_minutes % 60
+        history_in_days += int(history_in_hours/24)
+        history_in_hours = history_in_hours % 24
+        return history_in_days, history_in_hours, history_in_minutes
+
+    def get_metrics(self, key_prefix, metrics_lst, params, history=None):
+        start_time = datetime.now() - timedelta(minutes=10)
+        if history:
+            try:
+                days,hours,minutes = self.parse_history_param(history)
+                start_time = datetime.now() - timedelta(days=days, hours=hours, minutes=minutes)
+            except Exception:
+                raise PromClientException(f"Error parsing history string: {history}")
+        end_time = datetime.now()
+        data_out: list[dict] = []
+        for key in metrics_lst:
+            metrics = self.client.get_metric_range_data(
+                f"{key_prefix}_{key}", label_config=params, start_time=start_time, end_time=end_time)
+            for m in metrics:
+                mt_name = key
+                mt_values = m["values"]
+                for i, v in enumerate(mt_values):
+                    value = v[1]
+                    try:
+                        value = int(value)
+                    except Exception:
+                        pass
+                    if len(data_out) <= i:
+                        data_out.append({mt_name: value})
+                    else:
+                        d = data_out[i]
+                        if mt_name not in d:
+                            d[mt_name] = value
+
+        return data_out
+
+    def get_cluster_metrics(self, cluster_uuid, metrics_lst, history=None):
+        params = {
+            "cluster": cluster_uuid
+        }
+        return self.get_metrics("cluster", metrics_lst, params, history)
+
+    def get_node_metrics(self, snode_uuid, metrics_lst, history=None):
+        params = {
+            "snode": snode_uuid
+        }
+        return self.get_metrics("snode", metrics_lst, params, history)
+
+    def get_device_metrics(self, device_uuid, metrics_lst, history=None):
+        params = {
+            "device": device_uuid
+        }
+        return self.get_metrics("device", metrics_lst, params, history)
+
+    def get_lvol_metrics(self, lvol_uuid, metrics_lst, history=None):
+        params = {
+            "lvol": lvol_uuid
+        }
+        return self.get_metrics("lvol", metrics_lst, params, history)
+
+    def get_pool_metrics(self, pool_uuid, metrics_lst, history=None):
+        params = {
+            "pool": pool_uuid
+        }
+        return self.get_metrics("pool", metrics_lst, params, history)
diff --git a/simplyblock_core/services/capacity_and_stats_collector.py b/simplyblock_core/services/capacity_and_stats_collector.py
index 6f702d051..022dd84b5 100644
--- a/simplyblock_core/services/capacity_and_stats_collector.py
+++ b/simplyblock_core/services/capacity_and_stats_collector.py
@@ -83,6 +83,11 @@ def add_device_stats(cl, device, capacity_dict, stats_dict):
     stat_obj.write_to_db(db.kv_store)
     last_object_record[device.get_id()] = stat_obj
 
+    all_stats = db.get_device_stats(device, limit=0)
+    if len(all_stats) > 10:
+        for st in all_stats[10:]:
+            st.remove(db.kv_store)
+
     return stat_obj
 
 
@@ -117,6 +122,11 @@ def add_node_stats(node, records):
     stat_obj = NodeStatObject(data=data)
     stat_obj.write_to_db(db.kv_store)
 
+    all_stats = db.get_node_stats(node, limit=0)
+    if len(all_stats) > 10:
+        for st in all_stats[10:]:
+            st.remove(db.kv_store)
+
     return stat_obj
 
 
@@ -146,6 +156,11 @@ def add_cluster_stats(cl, records):
     stat_obj = ClusterStatObject(data=data)
     stat_obj.write_to_db(db.kv_store)
 
+    all_stats = db.get_cluster_stats(cl, limit=0)
+    if len(all_stats) > 10:
+        for st in all_stats[10:]:
+            st.remove(db.kv_store)
+
     return stat_obj
 
 
diff --git a/simplyblock_core/services/lvol_stat_collector.py b/simplyblock_core/services/lvol_stat_collector.py
index 09aa7d571..1933b6703 100644
--- a/simplyblock_core/services/lvol_stat_collector.py
+++ b/simplyblock_core/services/lvol_stat_collector.py
@@ -154,6 +154,11 @@ def add_lvol_stats(cluster, lvol, stats_list, capacity_dict=None):
     stat_obj.write_to_db(db.kv_store)
     last_object_record[lvol.get_id()] = stat_obj
 
+    all_stats = db.get_lvol_stats(lvol, limit=0)
+    if len(all_stats) > 10:
+        for st in all_stats[10:]:
+            st.remove(db.kv_store)
+
     return stat_obj
 
 
@@ -173,6 +178,12 @@ def add_pool_stats(pool, records):
 
     stat_obj = PoolStatObject(data=data)
     stat_obj.write_to_db(db.kv_store)
+
+    all_stats = db.get_pool_stats(pool, limit=0)
+    if len(all_stats) > 10:
+        for st in all_stats[10:]:
+            st.remove(db.kv_store)
+
     return stat_obj
 
 
diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py
index 719284ab4..9b6630680 100644
--- a/simplyblock_core/storage_node_ops.py
+++ b/simplyblock_core/storage_node_ops.py
@@ -31,6 +31,7 @@
 from simplyblock_core.models.snapshot import SnapShot
 from simplyblock_core.models.storage_node import StorageNode
 from simplyblock_core.models.cluster import Cluster
+from simplyblock_core.prom_client import PromClient
 from simplyblock_core.rpc_client import RPCClient, RPCException
 from simplyblock_core.snode_client import SNodeClient, SNodeClientException
 from simplyblock_web import node_utils
@@ -2474,20 +2475,11 @@ def resume_storage_node(node_id):
 def get_node_capacity(node_id, history, records_count=20, parse_sizes=True):
     db_controller = DBController()
     try:
-        this_node = db_controller.get_storage_node_by_id(node_id)
+        node = db_controller.get_storage_node_by_id(node_id)
     except KeyError:
         logger.error("Storage node Not found")
         return
 
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
-
-    records = db_controller.get_node_capacity(this_node, records_number)
     cap_stats_keys = [
         "date",
         "size_total",
@@ -2497,6 +2489,8 @@ def get_node_capacity(node_id, history, records_count=20, parse_sizes=True):
         "size_util",
         "size_prov_util",
     ]
+    prom_client = PromClient(node.cluster_id)
+    records = prom_client.get_node_metrics(node_id, cap_stats_keys, history)
     new_records = utils.process_records(records, records_count, keys=cap_stats_keys)
 
     if not parse_sizes:
@@ -2523,17 +2517,6 @@ def get_node_iostats_history(node_id, history, records_count=20, parse_sizes=Tru
     except KeyError:
         logger.error("node not found")
         return False
-
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
-
-    records = db_controller.get_node_stats(node, records_number)
-
     io_stats_keys = [
         "date",
         "read_bytes",
@@ -2571,6 +2554,8 @@ def get_node_iostats_history(node_id, history, records_count=20, parse_sizes=Tru
                 "write_latency_ticks",
             ]
         )
+    prom_client = PromClient(node.cluster_id)
+    records = prom_client.get_node_metrics(node_id, io_stats_keys, history)
     # combine records
     new_records = utils.process_records(records, records_count, keys=io_stats_keys)
 
diff --git a/simplyblock_web/api/v1/pool.py b/simplyblock_web/api/v1/pool.py
index a24a9e9b7..3b4fe5f72 100644
--- a/simplyblock_web/api/v1/pool.py
+++ b/simplyblock_web/api/v1/pool.py
@@ -184,21 +184,10 @@ def pool_iostats(uuid, history):
     except KeyError:
         return utils.get_response_error(f"Pool not found: {uuid}", 404)
 
-    if history:
-        records_number = core_utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
-
-    out = db.get_pool_stats(pool, records_number)
-    records_count = 20
-    new_records = core_utils.process_records(out, records_count)
-
+    data = pool_controller.get_io_stats(uuid, history)
     ret = {
         "object_data": pool.get_clean_dict(),
-        "stats": new_records or []
+        "stats": data or []
     }
     return utils.get_response(ret)
 
@@ -207,21 +196,13 @@ def pool_iostats(uuid, history):
 @bp.route('/pool/iostats-all-lvols/<string:pool_uuid>', methods=['GET'])
 def lvol_iostats(pool_uuid):
     try:
-        db.get_pool_by_id(pool_uuid)
+        pool = db.get_pool_by_id(pool_uuid)
     except KeyError:
         return utils.get_response_error(f"Pool not found: {pool_uuid}", 404)
 
-    ret = []
-    for lvol in db.get_lvols_by_pool_id(pool_uuid):
-
-        records_list = db.get_lvol_stats(lvol, limit=1)
-
-        if records_list:
-            data = records_list[0].get_clean_dict()
-        else:
-            data = {}
-        ret.append({
-            "object_data": lvol.get_clean_dict(),
-            "stats": data
-        })
+    data = pool_controller.get_capacity(pool_uuid)
+    ret = {
+        "object_data": pool.get_clean_dict(),
+        "stats": data or []
+    }
     return utils.get_response(ret)
diff --git a/simplyblock_web/api/v2/pool.py b/simplyblock_web/api/v2/pool.py
index c779f70ca..d34ce0b2a 100644
--- a/simplyblock_web/api/v2/pool.py
+++ b/simplyblock_web/api/v2/pool.py
@@ -122,5 +122,5 @@ def update(cluster: Cluster, pool: StoragePool, parameters: UpdatableStoragePool
 
 @instance_api.get('/iostats', name='clusters:storage-pools:iostats')
 def iostats(cluster: Cluster, pool: StoragePool, limit: int = 20):
-    records = db.get_pool_stats(pool, limit)
-    return core_utils.process_records(records, 20)
+    data = pool_controller.get_io_stats(pool.get_id(), history="")
+    return core_utils.process_records(data, 20)

From 6ddfd0b1d6b9912ba6f017412d2b3c246556945d Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Tue, 2 Dec 2025 01:55:14 +0300
Subject: [PATCH 33/68] Increase jc comp resume retry on node not online (#690)

---
 simplyblock_core/services/tasks_runner_jc_comp.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/simplyblock_core/services/tasks_runner_jc_comp.py b/simplyblock_core/services/tasks_runner_jc_comp.py
index 676156af3..6caf85b19 100644
--- a/simplyblock_core/services/tasks_runner_jc_comp.py
+++ b/simplyblock_core/services/tasks_runner_jc_comp.py
@@ -57,6 +57,7 @@
                         if node.status != StorageNode.STATUS_ONLINE:
                             msg = f"Node is {node.status}, retry task"
                             logger.info(msg)
+                            task.retry += 1
                             task.function_result = msg
                             task.status = JobSchedule.STATUS_SUSPENDED
                             task.write_to_db(db.kv_store)
@@ -79,6 +80,7 @@
                                     logger.info(msg)
                                     task.function_result = msg
                                     task.status = JobSchedule.STATUS_SUSPENDED
+                                    task.retry += 1
                                     task.write_to_db(db.kv_store)
                                     continue
 

From 8e3fe701bb3a6dad609ddaea9876b56a1b21421c Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Tue, 2 Dec 2025 16:22:08 +0300
Subject: [PATCH 34/68] Adds missing services to k8s mgmt (#788)

* Adds missing services to k8s mgmt

* added function create_k8s_service logic (#789)

* Add labels and missing service on upgrade

* fix linter

---------

Co-authored-by: Geoffrey Israel <israelgeoffrey13@gmail.com>
---
 simplyblock_core/cluster_ops.py               | 67 ++++++++-----
 .../scripts/charts/templates/app_k8s.yaml     | 51 ++++++++++
 simplyblock_core/utils/__init__.py            | 94 ++++++++++++++++++-
 3 files changed, 185 insertions(+), 27 deletions(-)

diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py
index 24be657d7..5cc9cb3ba 100644
--- a/simplyblock_core/cluster_ops.py
+++ b/simplyblock_core/cluster_ops.py
@@ -1180,44 +1180,43 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None,
                     service_names.append(service.attrs['Spec']['Name'])
 
         if "app_SnapshotMonitor" not in service_names:
-            logger.info("Creating snapshot monitor service")
-            cluster_docker.services.create(
-                image=service_image,
-                command="python simplyblock_core/services/snapshot_monitor.py",
-                name="app_SnapshotMonitor",
-                mounts=["/etc/foundationdb:/etc/foundationdb"],
-                env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"],
-                networks=["host"],
-                constraints=["node.role == manager"]
-            )
+            utils.create_docker_service(
+                cluster_docker=cluster_docker,
+                service_name="app_SnapshotMonitor",
+                service_file="python simplyblock_core/services/snapshot_monitor.py",
+                service_image=service_image)
 
         if "app_TasksRunnerLVolSyncDelete" not in service_names:
-            logger.info("Creating lvol sync delete service")
-            cluster_docker.services.create(
-                image=service_image,
-                command="python simplyblock_core/services/tasks_runner_sync_lvol_del.py",
-                name="app_TasksRunnerLVolSyncDelete",
-                mounts=["/etc/foundationdb:/etc/foundationdb"],
-                env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"],
-                networks=["host"],
-                constraints=["node.role == manager"]
-            )
+            utils.create_docker_service(
+                cluster_docker=cluster_docker,
+                service_name="app_TasksRunnerLVolSyncDelete",
+                service_file="python simplyblock_core/services/tasks_runner_sync_lvol_del.py",
+                service_image=service_image)
+
+        if "app_TasksRunnerJCCompResume" not in service_names:
+            utils.create_docker_service(
+                cluster_docker=cluster_docker,
+                service_name="app_TasksRunnerJCCompResume",
+                service_file="python simplyblock_core/services/tasks_runner_jc_comp.py",
+                service_image=service_image)
+
         logger.info("Done updating mgmt cluster")
 
     elif cluster.mode == "kubernetes":
         utils.load_kube_config_with_fallback()
         apps_v1 = k8s_client.AppsV1Api()
-
+        namespace = constants.K8S_NAMESPACE
         image_without_tag = constants.SIMPLY_BLOCK_DOCKER_IMAGE.split(":")[0]
         image_parts = "/".join(image_without_tag.split("/")[-2:])
         service_image = mgmt_image or constants.SIMPLY_BLOCK_DOCKER_IMAGE
-
+        deployment_names = []
         # Update Deployments
-        deployments = apps_v1.list_namespaced_deployment(namespace=constants.K8S_NAMESPACE)
+        deployments = apps_v1.list_namespaced_deployment(namespace=namespace)
         for deploy in deployments.items:
             if deploy.metadata.name == constants.ADMIN_DEPLOY_NAME:
                 logger.info(f"Skipping deployment {deploy.metadata.name}")
                 continue
+            deployment_names.append(deploy.metadata.name)
             for c in deploy.spec.template.spec.containers:
                 if image_parts in c.image:
                     logger.info(f"Updating deployment {deploy.metadata.name} image to {service_image}")
@@ -1227,12 +1226,28 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None,
                     deploy.spec.template.metadata.annotations = annotations
                     apps_v1.patch_namespaced_deployment(
                         name=deploy.metadata.name,
-                        namespace=constants.K8S_NAMESPACE,
+                        namespace=namespace,
                         body={"spec": {"template": deploy.spec.template}}
                     )
 
+        if "simplyblock-tasks-runner-sync-lvol-del" not in deployment_names:
+            utils.create_k8s_service(
+                namespace=namespace,
+                deployment_name="simplyblock-tasks-runner-sync-lvol-del",
+                container_name="tasks-runner-sync-lvol-del",
+                service_file="simplyblock_core/services/tasks_runner_sync_lvol_del.py",
+                container_image=service_image)
+
+        if "simplyblock-snapshot-monitor" not in deployment_names:
+            utils.create_k8s_service(
+                namespace=namespace,
+                deployment_name="simplyblock-snapshot-monitor",
+                container_name="snapshot-monitor",
+                service_file="simplyblock_core/services/snapshot_monitor.py",
+                container_image=service_image)
+
         # Update DaemonSets
-        daemonsets = apps_v1.list_namespaced_daemon_set(namespace=constants.K8S_NAMESPACE)
+        daemonsets = apps_v1.list_namespaced_daemon_set(namespace=namespace)
         for ds in daemonsets.items:
             for c in ds.spec.template.spec.containers:
                 if image_parts in c.image:
@@ -1243,7 +1258,7 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None,
                     ds.spec.template.metadata.annotations = annotations
                     apps_v1.patch_namespaced_daemon_set(
                         name=ds.metadata.name,
-                        namespace=constants.K8S_NAMESPACE,
+                        namespace=namespace,
                         body={"spec": {"template": ds.spec.template}}
                         )
 
diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml b/simplyblock_core/scripts/charts/templates/app_k8s.yaml
index d17ea092a..49c7490b7 100644
--- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml
+++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml
@@ -1100,6 +1100,57 @@ spec:
             - key: cluster-file
               path: fdb.cluster
 ---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: simplyblock-tasks-runner-sync-lvol-del
+  namespace: {{ .Release.Namespace }}
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: simplyblock-tasks-runner-sync-lvol-del
+  template:
+    metadata:
+      annotations:
+        log-collector/enabled: "true"
+        reloader.stakater.com/auto: "true"
+        reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
+      labels:
+        app: simplyblock-tasks-runner-sync-lvol-del
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet
+      containers:
+        - name: tasks-runner-sync-lvol-del
+          image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
+          imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}"
+          command: ["python", "simplyblock_core/services/tasks_runner_sync_lvol_del.py"]
+          env:
+            - name: SIMPLYBLOCK_LOG_LEVEL
+              valueFrom:
+                configMapKeyRef:
+                  name: simplyblock-config
+                  key: LOG_LEVEL
+          volumeMounts:
+          - name: fdb-cluster-file
+            mountPath: /etc/foundationdb/fdb.cluster
+            subPath: fdb.cluster
+          resources:
+            requests:
+              cpu: "200m"
+              memory: "256Mi"
+            limits:
+              cpu: "400m"
+              memory: "1Gi"
+      volumes:
+      - name: fdb-cluster-file
+        configMap:
+          name: simplyblock-fdb-cluster-config
+          items:
+            - key: cluster-file
+              path: fdb.cluster
+---
 
 apiVersion: apps/v1
 kind: DaemonSet
diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py
index 96a00ecac..7bc2fa112 100644
--- a/simplyblock_core/utils/__init__.py
+++ b/simplyblock_core/utils/__init__.py
@@ -12,8 +12,12 @@
 import time
 import socket
 from typing import Union, Any, Optional, Tuple
+from docker import DockerClient
 from kubernetes import client, config
-from kubernetes.client import ApiException
+from kubernetes.client import ApiException, V1Deployment, V1DeploymentSpec, V1ObjectMeta, \
+    V1PodTemplateSpec, V1PodSpec, V1Container, V1EnvVar, V1VolumeMount, V1Volume, V1ConfigMapVolumeSource, \
+    V1LabelSelector, V1ResourceRequirements
+
 import docker
 from prettytable import PrettyTable
 from docker.errors import APIError, DockerException, ImageNotFound, NotFound
@@ -2081,3 +2085,91 @@ def patch_prometheus_configmap(username: str, password: str):
     except Exception as e:
         logger.error(f"Unexpected error while patching ConfigMap: {e}")
         return False
+
+
+def create_docker_service(cluster_docker: DockerClient, service_name: str, service_file: str, service_image: str):
+    logger.info(f"Creating service: {service_name}")
+    cluster_docker.services.create(
+        image=service_image,
+        command=service_file,
+        name=service_name,
+        mounts=["/etc/foundationdb:/etc/foundationdb"],
+        env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"],
+        networks=["host"],
+        constraints=["node.role == manager"],
+        labels={
+            "com.docker.stack.image": service_image,
+            "com.docker.stack.namespace": "app"}
+    )
+
+def create_k8s_service(namespace: str, deployment_name: str,
+                       container_name: str, service_file: str, container_image: str):
+
+    logger.info(f"Creating deployment: {deployment_name} in namespace {namespace}")
+    load_kube_config_with_fallback()
+    apps_v1 = client.AppsV1Api()
+
+    env_list = [
+        V1EnvVar(
+            name="SIMPLYBLOCK_LOG_LEVEL",
+            value_from={"config_map_key_ref": {"name": "simplyblock-config", "key": "LOG_LEVEL"}}
+        )
+    ]
+
+    volume_mounts = [
+        V1VolumeMount(
+            name="fdb-cluster-file",
+            mount_path="/etc/foundationdb/fdb.cluster",
+            sub_path="fdb.cluster"
+        )
+    ]
+
+    volumes = [
+        V1Volume(
+            name="fdb-cluster-file",
+            config_map=V1ConfigMapVolumeSource(
+                name="simplyblock-fdb-cluster-config",
+                items=[{"key": "cluster-file", "path": "fdb.cluster"}]
+            )
+        )
+    ]
+
+    container = V1Container(
+        name=container_name,
+        image=container_image,
+        command=["python", service_file], 
+        env=env_list,
+        volume_mounts=volume_mounts,
+        resources=V1ResourceRequirements(
+            requests={"cpu": "200m", "memory": "256Mi"},
+            limits={"cpu": "400m", "memory": "1Gi"}
+        )
+    )
+
+    pod_spec = V1PodSpec(
+        containers=[container],
+        volumes=volumes,
+        host_network=True,
+        dns_policy="ClusterFirstWithHostNet"
+    )
+
+    pod_template = V1PodTemplateSpec(
+        metadata=V1ObjectMeta(labels={"app": deployment_name}),
+        spec=pod_spec
+    )
+
+    deployment_spec = V1DeploymentSpec(
+        replicas=1,
+        selector=V1LabelSelector(match_labels={"app": deployment_name}),
+        template=pod_template
+    )
+
+    deployment = V1Deployment(
+        api_version="apps/v1",
+        kind="Deployment",
+        metadata=V1ObjectMeta(name=deployment_name, namespace=namespace),
+        spec=deployment_spec
+    )
+
+    apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment)
+    logger.info(f"Deployment {deployment_name} created successfully.")

From a77c5e430cccfafff899c589017fa5eafea62534 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Wed, 3 Dec 2025 11:49:23 +0100
Subject: [PATCH 35/68] fix sfam-2507 (#791)

* fix sfam-2507

* refactored code to k8s standard

* fixed failing type check

---------

Co-authored-by: hamdykhader <hamdy.khader@gmail.com>
---
 simplyblock_core/cluster_ops.py               | 41 +++++++++++-----
 .../scripts/charts/templates/app_k8s.yaml     | 11 +++++
 simplyblock_web/api/v1/cluster.py             | 48 +++++++++++++++++++
 simplyblock_web/api/v2/cluster.py             |  6 ++-
 simplyblock_web/auth_middleware.py            |  2 +
 5 files changed, 96 insertions(+), 12 deletions(-)

diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py
index 5cc9cb3ba..fb43e8022 100644
--- a/simplyblock_core/cluster_ops.py
+++ b/simplyblock_core/cluster_ops.py
@@ -436,18 +436,23 @@ def _run_fio(mount_point) -> None:
 
 def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit,
                 distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, qpair_count,
-                max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric="tcp") -> str:
+                max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric="tcp",
+                cluster_ip=None, grafana_secret=None) -> str:
 
+
+    default_cluster = None
+    monitoring_secret = os.environ.get("MONITORING_SECRET", "")
     clusters = db_controller.get_clusters()
-    if not clusters:
-        raise ValueError("No previous clusters found!")
+    if clusters:
+        default_cluster = clusters[0]
+    else:
+        logger.info("No previous clusters found")
 
     if distr_ndcs == 0 and distr_npcs == 0:
         raise ValueError("both distr_ndcs and distr_npcs cannot be 0")
 
-    monitoring_secret = os.environ.get("MONITORING_SECRET", "")
-    
     logger.info("Adding new cluster")
+
     cluster = Cluster()
     cluster.uuid = str(uuid.uuid4())
     cluster.cluster_name = name
@@ -456,12 +461,27 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn
     cluster.nqn = f"{constants.CLUSTER_NQN}:{cluster.uuid}"
     cluster.secret = utils.generate_string(20)
     cluster.strict_node_anti_affinity = strict_node_anti_affinity
+    if default_cluster:
+        cluster.mode = default_cluster.mode
+        cluster.db_connection = default_cluster.db_connection
+        cluster.grafana_secret = grafana_secret if grafana_secret else default_cluster.grafana_secret
+        cluster.grafana_endpoint = default_cluster.grafana_endpoint
+    else:
+        # creating first cluster on k8s
+        cluster.mode = "kubernetes"
+        logger.info("Retrieving foundationdb connection string...")
+        fdb_cluster_string = utils.get_fdb_cluster_string(constants.FDB_CONFIG_NAME, constants.K8S_NAMESPACE)
+        cluster.db_connection = fdb_cluster_string
+        if monitoring_secret:
+            cluster.grafana_secret = monitoring_secret
+        else:
+            raise Exception("monitoring_secret is required")
+        cluster.grafana_endpoint = "http://simplyblock-grafana"
+        if not cluster_ip:
+            cluster_ip = "0.0.0.0"
 
-    default_cluster = clusters[0]
-    cluster.mode = default_cluster.mode
-    cluster.db_connection = default_cluster.db_connection
-    cluster.grafana_secret = monitoring_secret if default_cluster.mode == "kubernetes" else default_cluster.grafana_secret
-    cluster.grafana_endpoint = default_cluster.grafana_endpoint
+        # add mgmt node object
+        mgmt_node_ops.add_mgmt_node(cluster_ip, "kubernetes", cluster.uuid)
 
     _create_update_user(cluster.uuid, cluster.grafana_endpoint, cluster.grafana_secret, cluster.secret)
 
@@ -491,7 +511,6 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn
     cluster.create_dt = str(datetime.datetime.now())
     cluster.write_to_db(db_controller.kv_store)
     cluster_events.cluster_create(cluster)
-    qos_controller.add_class("Default", 100, cluster.get_id())
 
     return cluster.get_id()
 
diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml b/simplyblock_core/scripts/charts/templates/app_k8s.yaml
index 49c7490b7..988955a4f 100644
--- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml
+++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml
@@ -93,6 +93,17 @@ spec:
             configMapKeyRef:
               name: simplyblock-config
               key: LOG_LEVEL
+        - name: LVOL_NVMF_PORT_START
+          value: "{{ .Values.ports.lvolNvmfPortStart }}"
+        - name: K8S_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: MONITORING_SECRET
+          valueFrom:
+            secretKeyRef:
+              name: simplyblock-grafana-secrets
+              key: MONITORING_SECRET
         - name: FLASK_DEBUG
           value: "False"
         - name: FLASK_ENV
diff --git a/simplyblock_web/api/v1/cluster.py b/simplyblock_web/api/v1/cluster.py
index 698d9582d..5c5567694 100644
--- a/simplyblock_web/api/v1/cluster.py
+++ b/simplyblock_web/api/v1/cluster.py
@@ -60,6 +60,54 @@ def add_cluster():
     ))
 
 
+@bp.route('/cluster/create_first', methods=['POST'])
+def create_first_cluster():
+    cl_data = request.get_json()
+
+    if db.get_clusters():
+        return utils.get_response_error("Cluster found!", 400)
+
+    blk_size = 512
+    if 'blk_size' in cl_data:
+        if cl_data['blk_size'] not in [512, 4096]:
+            return utils.get_response_error("blk_size can be 512 or 4096", 400)
+        else:
+            blk_size = cl_data['blk_size']
+    page_size_in_blocks = cl_data.get('distr_ndcs', 2097152)
+    distr_ndcs = cl_data.get('distr_ndcs', 1)
+    distr_npcs = cl_data.get('distr_npcs', 1)
+    distr_bs = cl_data.get('distr_bs', 4096)
+    distr_chunk_bs = cl_data.get('distr_chunk_bs', 4096)
+    ha_type = cl_data.get('ha_type', 'single')
+    enable_node_affinity = cl_data.get('enable_node_affinity', False)
+    qpair_count = cl_data.get('qpair_count', 256)
+    name = cl_data.get('name', None)
+    fabric = cl_data.get('fabric', "tcp")
+    cap_warn = cl_data.get('cap_warn', 0)
+    cap_crit = cl_data.get('cap_crit', 0)
+    prov_cap_warn = cl_data.get('prov_cap_warn', 0)
+    prov_cap_crit = cl_data.get('prov_cap_crit', 0)
+    max_queue_size = cl_data.get('max_queue_size', 128)
+    inflight_io_threshold = cl_data.get('inflight_io_threshold', 4)
+    strict_node_anti_affinity = cl_data.get('strict_node_anti_affinity', False)
+    is_single_node = cl_data.get('is_single_node', False)
+    cluster_ip = cl_data.get('cluster_ip', None)
+    grafana_secret = cl_data.get('grafana_secret', None)
+
+    try:
+        cluster_id = cluster_ops.add_cluster(
+            blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit,
+            distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity,
+            qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric,
+            cluster_ip=cluster_ip, grafana_secret=grafana_secret)
+        if cluster_id:
+            return utils.get_response(db.get_cluster_by_id(cluster_id))
+        else:
+            return utils.get_response(False, "Failed to create cluster", 400)
+    except Exception as e:
+        return utils.get_response(False, str(e), 404)
+
+
 @bp.route('/cluster', methods=['GET'], defaults={'uuid': None})
 @bp.route('/cluster/<string:uuid>', methods=['GET'])
 def list_clusters(uuid):
diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py
index 422766246..8b203bb4a 100644
--- a/simplyblock_web/api/v2/cluster.py
+++ b/simplyblock_web/api/v2/cluster.py
@@ -24,7 +24,7 @@ class _UpdateParams(BaseModel):
 
 
 class ClusterParams(BaseModel):
-    name: Optional[str] = None
+    name: str | None = None
     blk_size: Literal[512, 4096] = 512
     page_size_in_blocks: int = Field(2097152, gt=0)
     cap_warn: util.Percent = 0
@@ -41,6 +41,10 @@ class ClusterParams(BaseModel):
     inflight_io_threshold: int = 4
     enable_node_affinity: bool = False
     strict_node_anti_affinity: bool = False
+    is_single_node: bool = False
+    fabric: str = "tcp"
+    cluster_ip: str | None = None
+    grafana_secret: str | None = None
 
 
 @api.get('/', name='clusters:list')
diff --git a/simplyblock_web/auth_middleware.py b/simplyblock_web/auth_middleware.py
index 8a1a9e83a..70755b46a 100644
--- a/simplyblock_web/auth_middleware.py
+++ b/simplyblock_web/auth_middleware.py
@@ -34,6 +34,8 @@ def decorated(*args: Any, **kwargs: Any) -> ResponseType:
         # Skip authentication for Swagger UI
         if request.method == "GET" and request.path.startswith("/swagger"):
             return cast(ResponseType, f(*args, **kwargs))
+        if request.method == "POST" and request.path.startswith("/cluster/create_first"):
+            return cast(ResponseType, f(*args, **kwargs))
 
         cluster_id: str = ""
         cluster_secret: str = ""

From db2ca62ac91f858217800f0aa58c6a97bff47e7d Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Wed, 3 Dec 2025 12:58:37 +0100
Subject: [PATCH 36/68] Update cluster.py (#793)

---
 simplyblock_web/api/v2/cluster.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py
index 8b203bb4a..7834e3f06 100644
--- a/simplyblock_web/api/v2/cluster.py
+++ b/simplyblock_web/api/v2/cluster.py
@@ -24,7 +24,7 @@ class _UpdateParams(BaseModel):
 
 
 class ClusterParams(BaseModel):
-    name: str | None = None
+    name: str = ""
     blk_size: Literal[512, 4096] = 512
     page_size_in_blocks: int = Field(2097152, gt=0)
     cap_warn: util.Percent = 0
@@ -43,9 +43,8 @@ class ClusterParams(BaseModel):
     strict_node_anti_affinity: bool = False
     is_single_node: bool = False
     fabric: str = "tcp"
-    cluster_ip: str | None = None
-    grafana_secret: str | None = None
-
+    cluster_ip: str = ""
+    grafana_secret: str = ""
 
 @api.get('/', name='clusters:list')
 def list() -> List[ClusterDTO]:

From 67455edc4f54d758e7658d34528baec09468185d Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Wed, 3 Dec 2025 15:10:01 +0100
Subject: [PATCH 37/68] Update mgmt_node_ops.py (#795)

---
 simplyblock_core/mgmt_node_ops.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/simplyblock_core/mgmt_node_ops.py b/simplyblock_core/mgmt_node_ops.py
index 84375d819..a867e4cbf 100644
--- a/simplyblock_core/mgmt_node_ops.py
+++ b/simplyblock_core/mgmt_node_ops.py
@@ -106,8 +106,6 @@ def deploy_mgmt_node(cluster_ip, cluster_id, ifname, mgmt_ip, cluster_secret, mo
 
         logger.info(f"Node IP: {dev_ip}")
 
-        hostname = utils.get_node_name_by_ip(dev_ip)
-        utils.label_node_as_mgmt_plane(hostname)
         db_connection = cluster_data['db_connection']
         db_controller = DBController()
         nodes = db_controller.get_mgmt_nodes()
@@ -225,10 +223,9 @@ def deploy_mgmt_node(cluster_ip, cluster_id, ifname, mgmt_ip, cluster_secret, mo
 
 def add_mgmt_node(mgmt_ip, mode, cluster_id=None):
     db_controller = DBController()
+    hostname = ""
     if mode == "docker":
         hostname = utils.get_hostname()
-    elif mode == "kubernetes":
-        hostname = utils.get_node_name_by_ip(mgmt_ip)
     try:
         node = db_controller.get_mgmt_node_by_hostname(hostname)
         if node:

From 20068bf71648854826dfa6356555f1dad52796d0 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Wed, 3 Dec 2025 15:10:36 +0100
Subject: [PATCH 38/68] remove function get_node_name_by_ip (#794)

---
 simplyblock_core/cluster_ops.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py
index fb43e8022..33a3b8aab 100644
--- a/simplyblock_core/cluster_ops.py
+++ b/simplyblock_core/cluster_ops.py
@@ -282,9 +282,6 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass,
         if not dev_ip:
             raise ValueError("Error getting ip: For Kubernetes-based deployments, please supply --mgmt-ip.")
 
-        current_node = utils.get_node_name_by_ip(dev_ip)
-        utils.label_node_as_mgmt_plane(current_node)
-
     if not cli_pass:
         cli_pass = utils.generate_string(10)
 

From e91dbc07518e4cf71d95f028489bc5b7afadde36 Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Wed, 3 Dec 2025 19:25:07 +0300
Subject: [PATCH 39/68] Fix /cluster/create_first response (#798)

---
 simplyblock_web/api/v1/cluster.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simplyblock_web/api/v1/cluster.py b/simplyblock_web/api/v1/cluster.py
index 5c5567694..f4eb2e690 100644
--- a/simplyblock_web/api/v1/cluster.py
+++ b/simplyblock_web/api/v1/cluster.py
@@ -101,7 +101,7 @@ def create_first_cluster():
             qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric,
             cluster_ip=cluster_ip, grafana_secret=grafana_secret)
         if cluster_id:
-            return utils.get_response(db.get_cluster_by_id(cluster_id))
+            return utils.get_response(db.get_cluster_by_id(cluster_id).to_dict())
         else:
             return utils.get_response(False, "Failed to create cluster", 400)
     except Exception as e:

From bc79957f919344161a204a47a4216f515e11b68c Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Thu, 4 Dec 2025 13:46:31 +0300
Subject: [PATCH 40/68] Remove user creation and switch (#799)

---
 docker/Dockerfile | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 1e1f8c3bd..c8999b47d 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -27,7 +27,3 @@ RUN if [ -d /usr/share/terminfo ]; then \
        rm -f /usr/share/terminfo/n/ncr260vt300wpp || true ; \
     fi
 
-RUN useradd -u 1001 -r -g 0 -d /app -s /sbin/nologin simplyblock && \
-    chown -R 1001:0 /app
-
-USER 1001

From 43a4caeb6683ebdcf973c9d47768063a063c2bcd Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Thu, 4 Dec 2025 16:11:58 +0300
Subject: [PATCH 41/68] Fix apiv2 pool add response to return pool dict (#800)

---
 simplyblock_web/api/v2/pool.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/simplyblock_web/api/v2/pool.py b/simplyblock_web/api/v2/pool.py
index d34ce0b2a..4ef2c897b 100644
--- a/simplyblock_web/api/v2/pool.py
+++ b/simplyblock_web/api/v2/pool.py
@@ -54,9 +54,8 @@ def add(request: Request, cluster: Cluster, parameters: StoragePoolParams) -> Re
 
     if not id_or_false:
         raise ValueError('Failed to create pool')
-
-    entity_url = request.app.url_path_for('clusters:storage-pools:detail', cluster_id=cluster.get_id(), pool_id=id_or_false)
-    return Response(status_code=201, headers={'Location': entity_url})
+    pool = db.get_pool_by_id(id_or_false)
+    return pool.to_dict()
 
 
 instance_api = APIRouter(prefix='/{pool_id}')

From ce479ebaae71c4f87c02da23cbf85d856389ce23 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Thu, 4 Dec 2025 14:12:43 +0100
Subject: [PATCH 42/68] Update mgmt_node_ops.py (#796)

---
 simplyblock_core/mgmt_node_ops.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/simplyblock_core/mgmt_node_ops.py b/simplyblock_core/mgmt_node_ops.py
index a867e4cbf..6d752a86c 100644
--- a/simplyblock_core/mgmt_node_ops.py
+++ b/simplyblock_core/mgmt_node_ops.py
@@ -112,10 +112,7 @@ def deploy_mgmt_node(cluster_ip, cluster_id, ifname, mgmt_ip, cluster_secret, mo
         if not nodes:
             logger.error("No mgmt nodes was found in the cluster!")
             return False
-        for node in nodes:
-            if node.hostname == hostname:
-                logger.error("Node already exists in the cluster")
-                return False
+
 
     logger.info("Adding management node object")
     node_id = add_mgmt_node(dev_ip, mode, cluster_id)

From ec075725a8ea439d3ef091bce6fedcd15dee049c Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Thu, 4 Dec 2025 16:14:16 +0300
Subject: [PATCH 43/68] Fix add-node apiv2 to remove unused param
 "full_page_unmap" (#801)

---
 simplyblock_web/api/v2/storage_node.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py
index f93fa5250..b59fadce3 100644
--- a/simplyblock_web/api/v2/storage_node.py
+++ b/simplyblock_web/api/v2/storage_node.py
@@ -37,7 +37,6 @@ class StorageNodeParams(BaseModel):
     test_device: bool = Field(False)
     spdk_image: Optional[str]
     spdk_debug: bool = Field(False)
-    full_page_unmap: bool = Field(False)
     data_nics: List[str] = Field([])
     namespace: str = Field('default')
     jm_percent: util.Percent = Field(3)
@@ -65,7 +64,6 @@ def add(request: Request, cluster: Cluster, parameters: StorageNodeParams) -> Re
             'enable_test_device': parameters.test_device,
             'namespace': parameters.namespace,
             'enable_ha_jm': parameters.ha_jm,
-            'full_page_unmap': parameters.full_page_unmap,
         }
     )
     if not task_id_or_false:

From f48c839aba5c454eedf4a491cc3a347b54423d51 Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Thu, 4 Dec 2025 16:18:52 +0300
Subject: [PATCH 44/68] Main fix add node apiv2 (#802)

* Fix add-node apiv2 to remove unused param "full_page_unmap"

* Fix optional param initial value for node-add apiv2 "spdk_image"
---
 simplyblock_web/api/v2/storage_node.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py
index b59fadce3..a7a9da7f8 100644
--- a/simplyblock_web/api/v2/storage_node.py
+++ b/simplyblock_web/api/v2/storage_node.py
@@ -35,7 +35,7 @@ class StorageNodeParams(BaseModel):
     max_snapshots: int = Field(500)
     ha_jm: bool = Field(True)
     test_device: bool = Field(False)
-    spdk_image: Optional[str]
+    spdk_image: Optional[str] = Field("")
     spdk_debug: bool = Field(False)
     data_nics: List[str] = Field([])
     namespace: str = Field('default')

From f673f0ad30e209641674a0745e9bcc09ae9194fd Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Thu, 4 Dec 2025 15:37:50 +0100
Subject: [PATCH 45/68] Update cluster_ops.py (#797)

* Update cluster_ops.py

* Update cluster_ops.py

* add grafana port 3000 to url

* removed debug log message
---
 simplyblock_core/cluster_ops.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py
index 33a3b8aab..5e6352cc0 100644
--- a/simplyblock_core/cluster_ops.py
+++ b/simplyblock_core/cluster_ops.py
@@ -473,15 +473,15 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn
             cluster.grafana_secret = monitoring_secret
         else:
             raise Exception("monitoring_secret is required")
-        cluster.grafana_endpoint = "http://simplyblock-grafana"
+        cluster.grafana_endpoint = "http://simplyblock-grafana:3000"
         if not cluster_ip:
             cluster_ip = "0.0.0.0"
 
         # add mgmt node object
         mgmt_node_ops.add_mgmt_node(cluster_ip, "kubernetes", cluster.uuid)
-
+                   
     _create_update_user(cluster.uuid, cluster.grafana_endpoint, cluster.grafana_secret, cluster.secret)
-
+                    
     cluster.distr_ndcs = distr_ndcs
     cluster.distr_npcs = distr_npcs
     cluster.distr_bs = distr_bs

From d2ad9737a2d17d62f79ba4c31809fd9a72b2d9fe Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Thu, 4 Dec 2025 18:35:50 +0300
Subject: [PATCH 46/68] Fix node-add apiv2 response (#803)

---
 simplyblock_web/api/v2/storage_node.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py
index a7a9da7f8..ab0a3f827 100644
--- a/simplyblock_web/api/v2/storage_node.py
+++ b/simplyblock_web/api/v2/storage_node.py
@@ -68,9 +68,7 @@ def add(request: Request, cluster: Cluster, parameters: StorageNodeParams) -> Re
     )
     if not task_id_or_false:
         raise ValueError('Failed to create add-node task')
-
-    task_url = request.app.url_path_for('clusters:storage-nodes:detail', cluster_id=cluster.get_id(), task_id=task_id_or_false)
-    return Response(status_code=201, headers={'Location': task_url})
+    return task_id_or_false
 
 
 instance_api = APIRouter(prefix='/{storage_node_id}')

From 1b0d7aa4a40d6a7190f3ac9413577cb115c97a5d Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Thu, 4 Dec 2025 18:41:09 +0300
Subject: [PATCH 47/68] Main fix node list apiv2 response (#804)

* Fix node-add apiv2 response

* Fix sn list apiv2 response
---
 simplyblock_web/api/v2/storage_node.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py
index ab0a3f827..d1aec59be 100644
--- a/simplyblock_web/api/v2/storage_node.py
+++ b/simplyblock_web/api/v2/storage_node.py
@@ -21,9 +21,9 @@
 
 
 @api.get('/', name='clusters:storage-nodes:list')
-def list(cluster: Cluster) -> List[StorageNodeDTO]:
+def list(cluster: Cluster) -> List[dict]:
     return [
-        StorageNodeDTO.from_model(storage_node)
+        storage_node.to_dict()
         for storage_node
         in db.get_storage_nodes_by_cluster_id(cluster.get_id())
     ]
@@ -46,7 +46,7 @@ class StorageNodeParams(BaseModel):
 
 
 @api.post('/', name='clusters:storage-nodes:create', status_code=201, responses={201: {"content": None}})
-def add(request: Request, cluster: Cluster, parameters: StorageNodeParams) -> Response:
+def add(request: Request, cluster: Cluster, parameters: StorageNodeParams):
     task_id_or_false = tasks_controller.add_node_add_task(
         cluster.get_id(),
         {

From e0cd5abcd9a99500bd88ee50bfae426e8218a8c7 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Thu, 4 Dec 2025 17:11:27 +0100
Subject: [PATCH 48/68] Update storage_deploy_spdk.yaml.j2 (#805)

---
 simplyblock_web/templates/storage_deploy_spdk.yaml.j2 | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2
index e49aca2e2..81f1e1eda 100644
--- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2
+++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2
@@ -87,16 +87,6 @@ spec:
           value: "{{ TOTAL_HP }}"
         - name: RPC_PORT
           value: "{{ RPC_PORT }}"
-        - name: SPDKCSI_SECRET
-          valueFrom:
-            secretKeyRef:
-              name: simplyblock-csi-secret
-              key: secret.json
-        - name: CLUSTER_CONFIG
-          valueFrom:
-            configMapKeyRef:
-              name: simplyblock-csi-cm
-              key: config.json
       lifecycle:
         postStart:
           exec:

From 7d8bb865e1905a6dea9ddf82bf855c94998c101f Mon Sep 17 00:00:00 2001
From: Raunak Jalan <41023976+RaunakJalan@users.noreply.github.com>
Date: Fri, 5 Dec 2025 13:53:14 +0530
Subject: [PATCH 49/68] Adding quick outage case, changes to ssh utils (#806)

Co-authored-by: root <root@vm22.simplyblock2.localdomain>
---
 e2e/__init__.py                               |    8 +-
 e2e/continuous_log_collector.py               |    4 +-
 e2e/e2e_tests/cluster_test_base.py            |   10 +-
 e2e/e2e_tests/single_node_multi_fio_perf.py   |    9 +-
 .../continuous_failover_ha_multi_client.py    |  131 +-
 ...s_failover_ha_multi_client_quick_outage.py |  535 +++++++
 .../continuous_failover_ha_multi_outage.py    |  398 ++++-
 e2e/utils/ssh_utils.py                        | 1389 ++++++++++++-----
 8 files changed, 1974 insertions(+), 510 deletions(-)
 create mode 100644 e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py

diff --git a/e2e/__init__.py b/e2e/__init__.py
index e8cae33f7..31164238e 100644
--- a/e2e/__init__.py
+++ b/e2e/__init__.py
@@ -55,6 +55,7 @@
 from stress_test.continuous_failover_ha_geomtery import RandomMultiGeometryFailoverTest
 from stress_test.continuous_failover_ha_2node import RandomMultiClient2NodeFailoverTest
 from stress_test.continuous_failover_ha_rdma import RandomRDMAFailoverTest
+from stress_test.continuous_failover_ha_multi_client_quick_outage import RandomRapidFailoverNoGap
 
 
 from e2e_tests.upgrade_tests.major_upgrade import TestMajorUpgrade
@@ -96,8 +97,8 @@ def get_all_tests(custom=True, ha_test=False):
         TestLvolFioNpcs0,
         TestLvolFioNpcs1,
         TestLvolFioNpcs2,
-        TestLvolFioQOSBW,
-        TestLvolFioQOSIOPS,
+        # TestLvolFioQOSBW,
+        # TestLvolFioQOSIOPS,
         TestSingleNodeOutage,
         # TestSingleNodeReboot,
         # TestHASingleNodeReboot,
@@ -147,6 +148,7 @@ def get_stress_tests():
         RandomMultiGeometryFailoverTest,
         RandomMultiClient2NodeFailoverTest,
         RandomRDMAFailoverTest,
+        RandomRapidFailoverNoGap,
     ]
     return tests
 
@@ -161,4 +163,4 @@ def get_load_tests():
     tests = [
         TestLvolOutageLoadTest
     ]
-    return tests
\ No newline at end of file
+    return tests
diff --git a/e2e/continuous_log_collector.py b/e2e/continuous_log_collector.py
index 48f06fd80..96b157760 100644
--- a/e2e/continuous_log_collector.py
+++ b/e2e/continuous_log_collector.py
@@ -22,7 +22,7 @@ def __init__(self,docker_logs_path=None):
 
     def get_log_directory(self):
         timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
-        return os.path.join(Path.home(), "container-logs", f"manual-logs-{timestamp}")
+        return os.path.join('/mnt/nfs_share/', f"snapshot-repliction-from-replicated-clone-{timestamp}")
 
     def collect_logs(self, test_name):
         all_nodes = set()
@@ -75,4 +75,4 @@ def collect_logs(self, test_name):
 
 if __name__ == "__main__":
     collector = ContinuousLogCollector()
-    collector.collect_logs(test_name="Manual")
+    collector.collect_logs(test_name="snapshot-repliction-from-replicated-clone")
diff --git a/e2e/e2e_tests/cluster_test_base.py b/e2e/e2e_tests/cluster_test_base.py
index 5077544b0..15743725b 100644
--- a/e2e/e2e_tests/cluster_test_base.py
+++ b/e2e/e2e_tests/cluster_test_base.py
@@ -401,13 +401,17 @@ def collect_management_details(self, post_teardown=False):
             cmd = f"{self.base_cmd} sn check {result['uuid']} >& {base_path}/node{node}_check{suffix}.txt"
             self.ssh_obj.exec_command(self.mgmt_nodes[0], cmd)
 
+            cmd = f"{self.base_cmd} sn get {result['uuid']} >& {base_path}/node{node}_get{suffix}.txt"
+            self.ssh_obj.exec_command(self.mgmt_nodes[0], cmd)
+
             node+=1
-        for node in self.fio_node:
+        all_nodes = self.storage_nodes + self.mgmt_nodes + self.client_machines:
+        for node in all_nodes:
             base_path = os.path.join(self.docker_logs_path, node)
-            cmd = f"journalctl -k >& {base_path}/jounalctl_{node}.txt"
+            cmd = f"journalctl -k --no-tail >& {base_path}/jounalctl_{node}-final.txt"
 
             self.ssh_obj.exec_command(node, cmd)
-            cmd = f"dmesg -T >& {base_path}/dmesg_{node}.txt"
+            cmd = f"dmesg -T >& {base_path}/dmesg_{node}-final.txt"
             self.ssh_obj.exec_command(node, cmd)
             
     def teardown(self, delete_lvols=True, close_ssh=True):
diff --git a/e2e/e2e_tests/single_node_multi_fio_perf.py b/e2e/e2e_tests/single_node_multi_fio_perf.py
index 86a75c4d5..681cc1742 100644
--- a/e2e/e2e_tests/single_node_multi_fio_perf.py
+++ b/e2e/e2e_tests/single_node_multi_fio_perf.py
@@ -187,10 +187,11 @@ def cleanup_lvols(self, lvol_configs):
         self.logger.info("Starting cleanup of LVOLs")
         for config in lvol_configs:
             lvol_name = config['lvol_name']
-            self.ssh_obj.unmount_path(node=self.client_machines[0],
-                                      device=self.lvol_devices[lvol_name]['MountPath'])
-            self.ssh_obj.remove_dir(node=self.client_machines[0], 
-                                    dir_path=self.lvol_devices[lvol_name]['MountPath'])
+            if config['mount']:
+                self.ssh_obj.unmount_path(node=self.client_machines[0],
+                                          device=self.lvol_devices[lvol_name]['MountPath'])
+                self.ssh_obj.remove_dir(node=self.client_machines[0], 
+                                        dir_path=self.lvol_devices[lvol_name]['MountPath'])
             lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=lvol_name)
             subsystems = self.ssh_obj.get_nvme_subsystems(node=self.client_machines[0], 
                                                           nqn_filter=lvol_id)
diff --git a/e2e/stress_test/continuous_failover_ha_multi_client.py b/e2e/stress_test/continuous_failover_ha_multi_client.py
index a2869482d..a97c42676 100644
--- a/e2e/stress_test/continuous_failover_ha_multi_client.py
+++ b/e2e/stress_test/continuous_failover_ha_multi_client.py
@@ -42,6 +42,7 @@ def __init__(self, **kwargs):
         self.sn_nodes = []
         self.current_outage_node = None
         self.snapshot_names = []
+        self.current_outage_nodes = []
         self.disconnect_thread = None
         self.outage_start_time = None
         self.outage_end_time = None
@@ -60,8 +61,7 @@ def __init__(self, **kwargs):
         # self.outage_types = ["graceful_shutdown", "container_stop", "interface_full_network_interrupt",
         #                      "interface_partial_network_interrupt",
         #                      "partial_nw"]
-        self.outage_types = ["graceful_shutdown", "container_stop", "interface_full_network_interrupt",
-                             "interface_partial_network_interrupt"]
+        self.outage_types = ["graceful_shutdown", "container_stop", "interface_full_network_interrupt"]
         # self.outage_types = ["partial_nw"]
         self.blocked_ports = None
         self.outage_log_file = os.path.join("logs", f"outage_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
@@ -111,7 +111,26 @@ def create_lvols_with_fio(self, count):
                 lvol_name = f"{self.lvol_name}_{i}" if not is_crypto else f"c{self.lvol_name}_{i}"
             self.logger.info(f"Creating lvol with Name: {lvol_name}, fs type: {fs_type}, crypto: {is_crypto}")
             try:
-                if self.current_outage_node:
+                self.logger.info(f"Current Outage Node: {self.current_outage_nodes}")
+                if self.current_outage_nodes:
+                    self.logger.info(f"Primary vs secondary: {self.sn_primary_secondary_map}")
+                    skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] in self.current_outage_nodes]
+                    self.logger.info(f"Skip Nodes: {skip_nodes}")
+                    for node in self.current_outage_nodes:
+                        skip_nodes.append(node)
+                    self.logger.info(f"Skip Nodes: {skip_nodes}")
+                    self.logger.info(f"Storage Nodes with sec: {self.sn_nodes_with_sec}")
+                    host_id = [node for node in self.sn_nodes_with_sec if node not in skip_nodes]
+                    self.sbcli_utils.add_lvol(
+                        lvol_name=lvol_name,
+                        pool_name=self.pool_name,
+                        size=self.lvol_size,
+                        crypto=is_crypto,
+                        key1=self.lvol_crypt_keys[0],
+                        key2=self.lvol_crypt_keys[1],
+                        host_id=host_id[0]
+                    )
+                elif self.current_outage_node:
                     skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] == self.current_outage_node]
                     skip_nodes.append(self.current_outage_node)
                     skip_nodes.append(self.sn_primary_secondary_map[self.current_outage_node])
@@ -276,7 +295,7 @@ def create_lvols_with_fio(self, count):
                     "iodepth": 1,
                     "numjobs": 5,
                     "time_based": True,
-                    "runtime": 2000,
+                    "runtime": 3000,
                     "log_avg_msec": 1000,
                     "iolog_file": self.lvol_mount_details[lvol_name]["iolog_base_path"],
                 },
@@ -306,11 +325,11 @@ def perform_random_outage(self):
         node_ip = node_details[0]["mgmt_ip"]
         node_rpc_port = node_details[0]["rpc_port"]
 
-        sleep_n_sec(120)
+        sleep_n_sec(5)
         for node in self.sn_nodes_with_sec:
-            self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
-                                      storage_node_id=node)
-        
+            # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
+            #                          storage_node_id=node)
+            self.logger.info(f"Skipping lvstore dump!!")
         for node in self.sn_nodes_with_sec:
             cur_node_details = self.sbcli_utils.get_storage_node_details(node)
             cur_node_ip = cur_node_details[0]["mgmt_ip"]
@@ -417,7 +436,7 @@ def perform_random_outage(self):
             
             self.disconnect_thread = threading.Thread(
                 target=self.ssh_obj.disconnect_all_active_interfaces,
-                args=(node_ip, active_interfaces, 600),
+                args=(node_ip, active_interfaces, 300),
             )
             self.disconnect_thread.start()
         elif outage_type == "interface_partial_network_interrupt":
@@ -430,7 +449,7 @@ def perform_random_outage(self):
             
             self.disconnect_thread = threading.Thread(
                 target=self.ssh_obj.disconnect_all_active_interfaces,
-                args=(node_ip, active_interfaces, 600),
+                args=(node_ip, active_interfaces, 300),
             )
             self.disconnect_thread.start()
         elif outage_type == "partial_nw":
@@ -478,12 +497,12 @@ def perform_random_outage(self):
                 self.ssh_obj.disconnect_lvol_node_device(node=self.lvol_mount_details[lvol]["Client"], device=self.lvol_mount_details[lvol]["Device"])
             
         if outage_type != "partial_nw" or outage_type != "partial_nw_single_port":
-            sleep_n_sec(120)
+            sleep_n_sec(10)
         
         return outage_type
     
     
-    def restart_nodes_after_failover(self, outage_type):
+    def restart_nodes_after_failover(self, outage_type, restart=False):
         """Perform steps for node restart."""
         node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node)
         node_ip = node_details[0]["mgmt_ip"]
@@ -543,14 +562,48 @@ def restart_nodes_after_failover(self, outage_type):
                 self.ssh_obj.exec_command(node=self.lvol_mount_details[lvol]["Client"], command=connect)
         
         elif outage_type == "container_stop":
-            self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000)
-            # Log the restart event
-            self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=1)
+            if restart:
+                max_retries = 10
+                retry_delay = 10  # seconds
+
+                # Retry mechanism for restarting the node
+                for attempt in range(max_retries):
+                    try:
+                        force=False
+                        if attempt == max_retries - 1:
+                            force=True
+                            self.logger.info("[CHECK] Restarting Node via CLI with Force flag as via API Fails.")
+                        else:
+                            self.logger.info("[CHECK] Restarting Node via CLI as via API Fails.")
+                        self.ssh_obj.restart_node(node=self.mgmt_nodes[0],
+                                                node_id=self.current_outage_node,
+                                                force=force)
+                        # else:
+                        #     self.sbcli_utils.restart_node(node_uuid=self.current_outage_node, expected_error_code=[503])
+                        self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000)
+                        break  # Exit loop if successful
+                    except Exception as _:
+                        if attempt < max_retries - 2:
+                            self.logger.info(f"Attempt {attempt + 1} failed to restart node. Retrying in {retry_delay} seconds...")
+                            sleep_n_sec(retry_delay)
+                        elif attempt < max_retries - 1:
+                            self.logger.info(f"Attempt {attempt + 1} failed to restart node via API. Retrying in {retry_delay} seconds via CMD...")
+                            sleep_n_sec(retry_delay)
+                        else:
+                            self.logger.info("Max retries reached. Failed to restart node.")
+                            raise  # Rethrow the last exception
+                self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000)
+                # Log the restart event
+                self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=0)
+            else:
+                self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000)
+                # Log the restart event
+                self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=2)
 
         elif "network_interrupt" in outage_type:
             self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000)
             # Log the restart event
-            self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=11)
+            self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=6)
         
         if not self.k8s_test:
             for node in self.storage_nodes:
@@ -608,9 +661,9 @@ def restart_nodes_after_failover(self, outage_type):
             # sleep_n_sec(30)
 
         for node in self.sn_nodes_with_sec:
-            self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
-                                      storage_node_id=node)
-
+            # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
+            #                          storage_node_id=node)
+            self.logger.info(f"Skipping lvstore dump!!")
 
     def create_snapshots_and_clones(self):
         """Create snapshots and clones during an outage."""
@@ -777,7 +830,7 @@ def create_snapshots_and_clones(self):
                     "iodepth": 1,
                     "numjobs": 5,
                     "time_based": True,
-                    "runtime": 2000,
+                    "runtime": 3000,
                     "log_avg_msec": 1000,
                     "iolog_file": self.clone_mount_details[clone_name]["iolog_base_path"],
                 },
@@ -786,22 +839,23 @@ def create_snapshots_and_clones(self):
             self.fio_threads.append(fio_thread)
             self.logger.info(f"Created snapshot {snapshot_name} and clone {clone_name}.")
 
-            self.sbcli_utils.resize_lvol(lvol_id=self.lvol_mount_details[lvol]["ID"],
-                                         new_size=f"{self.int_lvol_size}G")
+            if self.lvol_mount_details[lvol]["ID"]:
+                self.sbcli_utils.resize_lvol(lvol_id=self.lvol_mount_details[lvol]["ID"],
+                                             new_size=f"{self.int_lvol_size}G")
             sleep_n_sec(10)
-            self.sbcli_utils.resize_lvol(lvol_id=self.clone_mount_details[clone_name]["ID"],
-                                         new_size=f"{self.int_lvol_size}G")
-            
+            if self.clone_mount_details[clone_name]["ID"]:
+                self.sbcli_utils.resize_lvol(lvol_id=self.clone_mount_details[clone_name]["ID"],
+                                             new_size=f"{self.int_lvol_size}G")
+
 
     def delete_random_lvols(self, count):
         """Delete random lvols during an outage."""
         skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] == self.current_outage_node]
         skip_nodes.append(self.current_outage_node)
         skip_nodes.append(self.sn_primary_secondary_map[self.current_outage_node])
-        skip_nodes_lvol = []
-        self.logger.info(f"Skipping Nodes: {skip_nodes_lvol}")
+        self.logger.info(f"Skipping Nodes: {skip_nodes}")
         available_lvols = [
-            lvol for node, lvols in self.node_vs_lvol.items() if node not in skip_nodes_lvol for lvol in lvols
+            lvol for node, lvols in self.node_vs_lvol.items() if node not in skip_nodes for lvol in lvols
         ]
         self.logger.info(f"Available Lvols: {available_lvols}")
         if len(available_lvols) < count:
@@ -922,7 +976,7 @@ def perform_failover_during_outage(self):
                     storage_node_id=node,
                     logs_path=self.docker_logs_path
                 )
-            self.create_lvols_with_fio(3)
+            self.create_lvols_with_fio(5)
             if not self.k8s_test:
                 for node in self.storage_nodes:
                     self.ssh_obj.restart_docker_logging(
@@ -1041,7 +1095,7 @@ def restart_fio(self, iteration):
                     "iodepth": 1,
                     "numjobs": 5,
                     "time_based": True,
-                    "runtime": 2000,
+                    "runtime": 3000,
                     "log_avg_msec": 1000,
                     "iolog_file": self.lvol_mount_details[lvol]["iolog_base_path"],
                 },
@@ -1150,7 +1204,7 @@ def run(self):
                         storage_node_id=node,
                         logs_path=self.docker_logs_path
                     )
-                self.create_lvols_with_fio(5)
+                self.create_lvols_with_fio(3)
                 if not self.k8s_test:
                     for node in self.storage_nodes:
                         self.ssh_obj.restart_docker_logging(
@@ -1175,7 +1229,7 @@ def run(self):
             else:
                 self.logger.info(f"Current outage node: {self.current_outage_node} is secondary node. Skipping delete and create")
             if outage_type != "partial_nw" or outage_type != "partial_nw_single_port":
-                sleep_n_sec(280)
+                sleep_n_sec(100)
             for node in self.sn_nodes_with_sec:
                 cur_node_details = self.sbcli_utils.get_storage_node_details(node)
                 cur_node_ip = cur_node_details[0]["mgmt_ip"]
@@ -1195,7 +1249,7 @@ def run(self):
                 )
             self.logger.info("Waiting for fallback.")
             if outage_type != "partial_nw" or outage_type != "partial_nw_single_port":
-                sleep_n_sec(100)
+                sleep_n_sec(15)
             time_duration = self.common_utils.calculate_time_duration(
                 start_timestamp=self.outage_start_time,
                 end_timestamp=self.outage_end_time
@@ -1213,23 +1267,24 @@ def run(self):
             no_task_ok = outage_type in {"partial_nw", "partial_nw_single_port", "lvol_disconnect_primary"}
             if not self.sbcli_utils.is_secondary_node(self.current_outage_node):
                 self.validate_migration_for_node(self.outage_start_time, 2000, None, 60, no_task_ok=no_task_ok)
+                # pass
 
             for clone, clone_details in self.clone_mount_details.items():
                 self.common_utils.validate_fio_test(clone_details["Client"],
                                                     log_file=clone_details["Log"])
-                # self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone}_fio*"])
-                # self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone}_fio_iolog*"])
+                self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone}_fio*"])
+                self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone}_fio_iolog*"])
             
             for lvol, lvol_details in self.lvol_mount_details.items():
                 self.common_utils.validate_fio_test(lvol_details["Client"],
                                                     log_file=lvol_details["Log"])
-                # self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/local-{lvol}_fio*"])
-                # self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"])
+                self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/local-{lvol}_fio*"])
+                self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"])
 
             # Perform failover and manage resources during outage
             outage_type = self.perform_failover_during_outage()
             if outage_type != "partial_nw" or outage_type != "partial_nw_single_port":
-                sleep_n_sec(100)
+                sleep_n_sec(15)
             time_duration = self.common_utils.calculate_time_duration(
                 start_timestamp=self.outage_start_time,
                 end_timestamp=self.outage_end_time
diff --git a/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py
new file mode 100644
index 000000000..afa98b055
--- /dev/null
+++ b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py
@@ -0,0 +1,535 @@
+# stress_test/continuous_failover_ha_multi_client_quick_outage.py
+# Fast outages with long-running FIO, no churn beyond initial setup.
+# - Create lvols, snapshots, clones ONCE at the beginning
+# - Start 30min FIO on all mounts (lvols + clones)
+# - Run fast outages (as soon as node is ONLINE again)
+# - Every 5 outages: wait for all FIO to complete, validate, then (optionally) wait for migration window
+# - Graceful shutdown: suspend -> wait SUSPENDED -> shutdown -> wait OFFLINE -> keep offline 5 min -> restart
+# - After any restart: 15–30s idle then immediately next outage
+
+import os
+import random
+import string
+import threading
+from datetime import datetime
+from utils.common_utils import sleep_n_sec
+from exceptions.custom_exception import LvolNotConnectException
+from stress_test.lvol_ha_stress_fio import TestLvolHACluster
+
+
+def _rand_id(n=15, first_alpha=True):
+    letters = string.ascii_uppercase
+    digits = string.digits
+    allc = letters + digits
+    if first_alpha:
+        return random.choice(letters) + ''.join(random.choices(allc, k=n-1))
+    return ''.join(random.choices(allc, k=n))
+
+
+class RandomRapidFailoverNoGap(TestLvolHACluster):
+    """
+    - Minimal churn (only bootstrap creates)
+    - Long FIO (30 mins) on every lvol/clone
+    - Outage pacing: next outage right after ONLINE; add 15–30s buffer post-restart
+    - Validate FIO and pause for migration every 5 outages
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        # Base knobs
+        self.total_lvols = 20
+        self.lvol_size = "40G"
+        self.fio_size = "15G"
+
+        # Validation cadence & FIO runtime
+        self.validate_every = 5
+        self._iter = 0
+        self._per_wave_fio_runtime = 3600      # 60 minutes
+        self._fio_wait_timeout = 5000          # wait for all to finish
+
+        # Internal state
+        self.fio_threads = []
+        self.lvol_mount_details = {}
+        self.clone_mount_details = {}
+        self.sn_nodes = []
+        self.sn_nodes_with_sec = []
+        self.sn_primary_secondary_map = {}
+        self.node_vs_lvol = {}
+        self.snapshot_names = []
+        self.snap_vs_node = {}
+        self.current_outage_node = None
+        self.outage_start_time = None
+        self.outage_end_time = None
+        self.first_outage_ts = None            # track the first outage for migration window
+        self.test_name = "longfio_nochurn_rapid_outages"
+
+        self.outage_types = [
+            "graceful_shutdown",
+            "container_stop",
+            # "interface_full_network_interrupt",
+        ]
+
+        # Names
+        self.lvol_base = f"lvl{_rand_id(12)}"
+        self.clone_base = f"cln{_rand_id(12)}"
+        self.snap_base = f"snap{_rand_id(12)}"
+
+        # Logging file for outages
+        self.outage_log_file = os.path.join("logs", f"outage_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
+        self._init_outage_log()
+
+    # ---------- small utilities ----------
+
+    def _init_outage_log(self):
+        os.makedirs(os.path.dirname(self.outage_log_file), exist_ok=True)
+        with open(self.outage_log_file, "w") as f:
+            f.write("Timestamp,Node,Outage_Type,Event\n")
+
+    def _log_outage_event(self, node, outage_type, event):
+        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        with open(self.outage_log_file, "a") as f:
+            f.write(f"{ts},{node},{outage_type},{event}\n")
+
+    def _short_bs(self):
+        # return f"{2 ** random.randint(2, 7)}K"  # 4K–128K
+        return f"{2 ** 6}K"
+
+    def _pick_outage(self):
+        random.shuffle(self.outage_types)
+        return self.outage_types[0]
+
+    # ---------- cluster bootstrap ----------
+
+    def _wait_cluster_active(self, timeout=900, poll=5):
+        """
+        Poll `sbctl cluster list` until status ACTIVE.
+        Avoids 400 in_activation when creating lvol/snap/clone during bring-up.
+        """
+        end = datetime.now().timestamp() + timeout
+        while datetime.now().timestamp() < end:
+            try:
+                info = self.ssh_obj.cluster_list(self.mgmt_nodes[0], self.cluster_id)  # must wrap "sbctl cluster list"
+                self.logger.info(info)
+                # Expect a single row with Status
+                status = str(info).upper()
+                if "ACTIVE" in status:
+                    return
+            except Exception as e:
+                self.logger.info(f"ERROR: {e}")
+            sleep_n_sec(poll)
+        raise RuntimeError("Cluster did not become ACTIVE within timeout")
+
+    def _bootstrap_cluster(self):
+        # Ensure Cluster is ACTIVE
+        self._wait_cluster_active()
+
+        # create pool
+        self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+
+        # discover storage nodes
+        storage_nodes = self.sbcli_utils.get_storage_nodes()
+        for res in storage_nodes['results']:
+            self.sn_nodes.append(res["uuid"])
+            self.sn_nodes_with_sec.append(res["uuid"])
+            self.sn_primary_secondary_map[res["uuid"]] = res["secondary_node_id"]
+        
+        self.logger.info(f"[LFNG] SN sec map: {self.sn_primary_secondary_map}")
+
+        # initial lvols + mount + then later clone from snapshots
+        self._create_lvols(count=self.total_lvols)  # start_fio=False → we launch after clones
+        self._seed_snapshots_and_clones()           # also mounts clones
+
+        # Start 30 min FIO on all (lvols + clones)
+        self._kick_fio_for_all(runtime=self._per_wave_fio_runtime)
+
+        # start container logs
+        if not self.k8s_test:
+            for node in self.storage_nodes:
+                self.ssh_obj.restart_docker_logging(
+                    node_ip=node,
+                    containers=self.container_nodes[node],
+                    log_dir=os.path.join(self.docker_logs_path, node),
+                    test_name=self.test_name
+                )
+        else:
+            self.runner_k8s_log.restart_logging()
+
+    # ---------- lvol / fio helpers ----------
+
+    def _create_lvols(self, count=1):
+        for _ in range(count):
+            fs_type = random.choice(["ext4", "xfs"])
+            is_crypto = random.choice([True, False])
+            name_core = f"{self.lvol_base}_{_rand_id(6, first_alpha=False)}"
+            lvol_name = name_core if not is_crypto else f"c{name_core}"
+
+            kwargs = dict(
+                lvol_name=lvol_name,
+                pool_name=self.pool_name,
+                size=self.lvol_size,
+                crypto=is_crypto,
+                key1=self.lvol_crypt_keys[0],
+                key2=self.lvol_crypt_keys[1],
+            )
+
+            # Avoid outage node & partner during initial placement
+            if self.current_outage_node:
+                skip_nodes = [self.current_outage_node, self.sn_primary_secondary_map.get(self.current_outage_node)]
+                skip_nodes += [p for p, s in self.sn_primary_secondary_map.items() if s == self.current_outage_node]
+                host_id = [n for n in self.sn_nodes_with_sec if n not in skip_nodes]
+                if host_id:
+                    kwargs["host_id"] = host_id[0]
+
+            # Ensure cluster ACTIVE before creating
+            self._wait_cluster_active()
+
+            try:
+                self.sbcli_utils.add_lvol(**kwargs)
+            except Exception as e:
+                self.logger.warning(f"[LFNG] lvol create failed ({lvol_name}) → {e}; retry once after ACTIVE gate")
+                self._wait_cluster_active()
+                self.sbcli_utils.add_lvol(**kwargs)
+
+            # record
+            lvol_id = self.sbcli_utils.get_lvol_id(lvol_name)
+            self.lvol_mount_details[lvol_name] = {
+                "ID": lvol_id,
+                "Command": None,
+                "Mount": None,
+                "Device": None,
+                "MD5": None,
+                "FS": fs_type,
+                "Log": f"{self.log_path}/{lvol_name}.log",
+                "snapshots": [],
+                "iolog_base_path": f"{self.log_path}/{lvol_name}_fio_iolog",
+            }
+
+            # refresh list
+            self.ssh_obj.exec_command(node=self.mgmt_nodes[0], command=f"{self.base_cmd} lvol list", supress_logs=True)
+
+            # track node placement
+            lvol_node_id = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id)[0]["node_id"]
+            self.node_vs_lvol.setdefault(lvol_node_id, []).append(lvol_name)
+
+            # connect
+            connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=lvol_name)
+            self.lvol_mount_details[lvol_name]["Command"] = connect_ls
+
+            client_node = random.choice(self.fio_node)
+            self.lvol_mount_details[lvol_name]["Client"] = client_node
+
+            initial = self.ssh_obj.get_devices(node=client_node)
+            for c in connect_ls:
+                _, err = self.ssh_obj.exec_command(node=client_node, command=c)
+                if err:
+                    nqn = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id)[0]["nqn"]
+                    self.ssh_obj.disconnect_nvme(node=client_node, nqn_grep=nqn)
+                    self.logger.info(f"[LFNG] connect error → clean lvol {lvol_name}")
+                    self.sbcli_utils.delete_lvol(lvol_name=lvol_name, max_attempt=20, skip_error=True)
+                    sleep_n_sec(3)
+                    del self.lvol_mount_details[lvol_name]
+                    self.node_vs_lvol[lvol_node_id].remove(lvol_name)
+                    break
+
+            final = self.ssh_obj.get_devices(node=client_node)
+            new_dev = None
+            for d in final:
+                if d not in initial:
+                    new_dev = f"/dev/{d.strip()}"
+                    break
+            if not new_dev:
+                raise LvolNotConnectException("LVOL did not connect")
+
+            self.lvol_mount_details[lvol_name]["Device"] = new_dev
+            self.ssh_obj.format_disk(node=client_node, device=new_dev, fs_type=fs_type)
+
+            mnt = f"{self.mount_path}/{lvol_name}"
+            self.ssh_obj.mount_path(node=client_node, device=new_dev, mount_path=mnt)
+            self.lvol_mount_details[lvol_name]["Mount"] = mnt
+
+            # clean old logs
+            self.ssh_obj.delete_files(client_node, [
+                f"{mnt}/*fio*",
+                f"{self.log_path}/local-{lvol_name}_fio*",
+                f"{self.log_path}/{lvol_name}_fio_iolog*"
+            ])
+
+    def _seed_snapshots_and_clones(self):
+        """Create one snapshot and one clone per lvol (best effort). Mount clones on same client."""
+        for lvol, det in list(self.lvol_mount_details.items()):
+            # Ensure ACTIVE
+            self._wait_cluster_active()
+
+            snap_name = f"{self.snap_base}_{_rand_id(8, first_alpha=False)}"
+            out, err = self.ssh_obj.add_snapshot(self.mgmt_nodes[0], det["ID"], snap_name)
+            if "(False," in str(out) or "(False," in str(err):
+                self.logger.warning(f"[LFNG] snapshot create failed for {lvol} → skip clone")
+                continue
+
+            self.snapshot_names.append(snap_name)
+            node_id = self.sbcli_utils.get_lvol_details(lvol_id=det["ID"])[0]["node_id"]
+            self.snap_vs_node[snap_name] = node_id
+            det["snapshots"].append(snap_name)
+
+            snap_id = self.ssh_obj.get_snapshot_id(self.mgmt_nodes[0], snap_name)
+            clone_name = f"{self.clone_base}_{_rand_id(8, first_alpha=False)}"
+            try:
+                self.ssh_obj.add_clone(self.mgmt_nodes[0], snap_id, clone_name)
+            except Exception as e:
+                self.logger.warning(f"[LFNG] clone create failed for {lvol} → {e}")
+                continue
+
+            # connect clone
+            fs_type = det["FS"]
+            client = det["Client"]
+
+            self.clone_mount_details[clone_name] = {
+                "ID": self.sbcli_utils.get_lvol_id(clone_name),
+                "Command": None,
+                "Mount": None,
+                "Device": None,
+                "MD5": None,
+                "FS": fs_type,
+                "Log": f"{self.log_path}/{clone_name}.log",
+                "snapshot": snap_name,
+                "Client": client,
+                "iolog_base_path": f"{self.log_path}/{clone_name}_fio_iolog",
+            }
+
+            connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=clone_name)
+            self.clone_mount_details[clone_name]["Command"] = connect_ls
+
+            initial = self.ssh_obj.get_devices(node=client)
+            for c in connect_ls:
+                _, err = self.ssh_obj.exec_command(node=client, command=c)
+                if err:
+                    nqn = self.sbcli_utils.get_lvol_details(lvol_id=self.clone_mount_details[clone_name]["ID"])[0]["nqn"]
+                    self.ssh_obj.disconnect_nvme(node=client, nqn_grep=nqn)
+                    self.logger.info(f"[LFNG] connect clone error → cleanup")
+                    self.sbcli_utils.delete_lvol(lvol_name=clone_name, max_attempt=20, skip_error=True)
+                    sleep_n_sec(3)
+                    del self.clone_mount_details[clone_name]
+                    continue
+
+            final = self.ssh_obj.get_devices(node=client)
+            new_dev = None
+            for d in final:
+                if d not in initial:
+                    new_dev = f"/dev/{d.strip()}"
+                    break
+            if not new_dev:
+                raise LvolNotConnectException("Clone did not connect")
+
+            self.clone_mount_details[clone_name]["Device"] = new_dev
+            if fs_type == "xfs":
+                self.ssh_obj.clone_mount_gen_uuid(client, new_dev)
+            mnt = f"{self.mount_path}/{clone_name}"
+            self.ssh_obj.mount_path(node=client, device=new_dev, mount_path=mnt)
+            self.clone_mount_details[clone_name]["Mount"] = mnt
+
+            # purge old logs
+            self.ssh_obj.delete_files(client, [
+                f"{self.log_path}/local-{clone_name}_fio*",
+                f"{self.log_path}/{clone_name}_fio_iolog*",
+                f"{mnt}/*fio*"
+            ])
+
+    def _kick_fio_for_all(self, runtime=None):
+        """Start verified fio (PID-checked; auto-rerun) for all lvols + clones."""
+        # small stagger to avoid SSH bursts
+        def _launch(name, det):
+            self.ssh_obj.run_fio_test(
+                det["Client"], None, det["Mount"], det["Log"],
+                size=self.fio_size, name=f"{name}_fio", rw="randrw",
+                bs=self._short_bs(), nrfiles=8, iodepth=1, numjobs=2,
+                time_based=True, runtime=runtime, log_avg_msec=1000,
+                iolog_file=det["iolog_base_path"], max_latency="30s",
+                verify="md5", verify_dump=1, verify_fatal=1, retries=6,
+                use_latency=False
+            )
+
+        for lvol, det in self.lvol_mount_details.items():
+            self.ssh_obj.delete_files(det["Client"], [f"/mnt/{lvol}/*"])
+            t = threading.Thread(target=_launch, args=(lvol, det))
+            t.start()
+            self.fio_threads.append(t)
+            sleep_n_sec(0.2)
+
+        for cname, det in self.clone_mount_details.items():
+            self.ssh_obj.delete_files(det["Client"], [f"/mnt/{cname}/*"])
+            t = threading.Thread(target=_launch, args=(cname, det))
+            t.start()
+            self.fio_threads.append(t)
+            sleep_n_sec(0.2)
+
+    # ---------- outage flow ----------
+
+    def _perform_outage(self):
+        random.shuffle(self.sn_nodes)
+        self.current_outage_node = self.sn_nodes[0]
+        outage_type = self._pick_outage()
+
+        if self.first_outage_ts is None:
+            self.first_outage_ts = int(datetime.now().timestamp())
+
+        cur_node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node)
+        cur_node_ip = cur_node_details[0]["mgmt_ip"]
+        self.ssh_obj.fetch_distrib_logs(
+            storage_node_ip=cur_node_ip,
+            storage_node_id=self.current_outage_node,
+            logs_path=self.docker_logs_path
+        )
+        
+        # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
+        #                           storage_node_id=self.current_outage_node)
+
+        self.outage_start_time = int(datetime.now().timestamp())
+        self._log_outage_event(self.current_outage_node, outage_type, "Outage started")
+        self.logger.info(f"[LFNG] Outage={outage_type} node={self.current_outage_node}")
+
+        node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node)
+        node_ip = node_details[0]["mgmt_ip"]
+        node_rpc_port = node_details[0]["rpc_port"]
+
+        if outage_type == "graceful_shutdown":
+            # suspend -> wait SUSPENDED -> shutdown -> wait OFFLINE
+            try:
+                self.logger.info(f"[LFNG] Suspending node via: sbcli-dev sn suspend {self.current_outage_node}")
+                self.sbcli_utils.suspend_node(node_uuid=self.current_outage_node, expected_error_code=[503])
+                self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "suspended", timeout=600)
+            except Exception:
+                self.logger.warning("[LFNG] Suspend failed from API; ignoring if already suspended")
+
+            try:
+                self.sbcli_utils.shutdown_node(node_uuid=self.current_outage_node, force=True, expected_error_code=[503])
+            except Exception:
+                self.ssh_obj.shutdown_node(node=self.mgmt_nodes[0], node_id=self.current_outage_node, force=True)
+            self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "offline", timeout=900)
+
+            for node in self.sn_nodes_with_sec:
+                if node != self.current_outage_node:
+                    cur_node_details = self.sbcli_utils.get_storage_node_details(node)
+                    cur_node_ip = cur_node_details[0]["mgmt_ip"]
+                    self.ssh_obj.fetch_distrib_logs(
+                        storage_node_ip=cur_node_ip,
+                        storage_node_id=node,
+                        logs_path=self.docker_logs_path
+                    )
+            # Keep node strictly offline for 5 minutes
+            sleep_n_sec(500)
+
+        elif outage_type == "container_stop":
+            self.ssh_obj.stop_spdk_process(node_ip, node_rpc_port)
+
+        elif outage_type == "interface_full_network_interrupt":
+            # Down all active data interfaces for ~300s (5 minutes) with ping verification
+            active = self.ssh_obj.get_active_interfaces(node_ip)
+            self.ssh_obj.disconnect_all_active_interfaces(node_ip, active, 300)
+            sleep_n_sec(280)
+
+        return outage_type
+
+    def restart_nodes_after_failover(self, outage_type):
+        node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node)
+
+        self.logger.info(f"[LFNG] Recover outage={outage_type} node={self.current_outage_node}")
+
+        cur_node_details = self.sbcli_utils.get_storage_node_details(self.sn_primary_secondary_map[self.current_outage_node])
+        cur_node_ip = cur_node_details[0]["mgmt_ip"]
+        self.ssh_obj.fetch_distrib_logs(
+            storage_node_ip=cur_node_ip,
+            storage_node_id=self.sn_primary_secondary_map[self.current_outage_node],
+            logs_path=self.docker_logs_path
+        )
+
+        # Only wait for ONLINE (skip deep health)
+        if outage_type == 'graceful_shutdown':
+            try:
+                self.ssh_obj.restart_node(self.mgmt_nodes[0], node_id=self.current_outage_node, force=True)
+            except Exception:
+                pass
+            self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=900)
+        elif outage_type == 'container_stop':
+            self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=900)
+        elif "network_interrupt" in outage_type:
+            self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=900)
+
+        self._log_outage_event(self.current_outage_node, outage_type, "Node online")
+        self.outage_end_time = int(datetime.now().timestamp())
+
+        cur_node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node)
+        cur_node_ip = cur_node_details[0]["mgmt_ip"]
+        self.ssh_obj.fetch_distrib_logs(
+            storage_node_ip=cur_node_ip,
+            storage_node_id=self.current_outage_node,
+            logs_path=self.docker_logs_path
+        )
+
+        # keep container log streaming going
+        if not self.k8s_test:
+            for node in self.storage_nodes:
+                self.ssh_obj.restart_docker_logging(
+                    node_ip=node,
+                    containers=self.container_nodes[node],
+                    log_dir=os.path.join(self.docker_logs_path, node),
+                    test_name=self.test_name
+                )
+        else:
+            self.runner_k8s_log.restart_logging()
+
+        # small cool-down before next outage to reduce SSH churn
+        # sleep_n_sec(random.randint(1, 5))
+
+    # ---------- main ----------
+
+    def run(self):
+        self.logger.info("[LFNG] Starting RandomRapidFailoverNoGap")
+        self._bootstrap_cluster()
+        sleep_n_sec(5)
+
+        iteration = 1
+        while True:
+            outage_type = self._perform_outage()
+            self.restart_nodes_after_failover(outage_type)
+
+            self._iter += 1
+            if self._iter % self.validate_every == 0:
+                self.logger.info(f"[LFNG] {self._iter} outages → wait & validate all FIO")
+                # Join launch threads so we know all jobs issued
+                for t in self.fio_threads:
+                    t.join(timeout=10)
+                self.fio_threads = []
+
+                # Wait for all fio jobs to end (they’re 30min jobs)
+                self.common_utils.manage_fio_threads(self.fio_node, [], timeout=self._fio_wait_timeout)
+
+                for node in self.sn_nodes_with_sec:
+                    cur_node_details = self.sbcli_utils.get_storage_node_details(node)
+                    cur_node_ip = cur_node_details[0]["mgmt_ip"]
+                    self.ssh_obj.fetch_distrib_logs(
+                        storage_node_ip=cur_node_ip,
+                        storage_node_id=node,
+                        logs_path=self.docker_logs_path
+                    )
+                
+                    self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
+                                              storage_node_id=node)
+
+                # Validate logs
+                for lvol, det in self.lvol_mount_details.items():
+                    self.common_utils.validate_fio_test(det["Client"], log_file=det["Log"])
+                for cname, det in self.clone_mount_details.items():
+                    self.common_utils.validate_fio_test(det["Client"], log_file=det["Log"])
+
+                # Optional: wait for migration window after FIO completes
+                # (replace with your actual migration-check, if any)
+                self.logger.info("[LFNG] FIO validated; pausing briefly for migration window")
+                sleep_n_sec(10)
+
+                # Re-kick next 30min wave
+                self._kick_fio_for_all(runtime=self._per_wave_fio_runtime)
+                self.logger.info("[LFNG] Next FIO wave started")
+
+            self.logger.info(f"[LFNG] Iter {iteration} complete → starting next outage ASAP")
+            iteration += 1
\ No newline at end of file
diff --git a/e2e/stress_test/continuous_failover_ha_multi_outage.py b/e2e/stress_test/continuous_failover_ha_multi_outage.py
index fb5f6d507..e96a0b547 100644
--- a/e2e/stress_test/continuous_failover_ha_multi_outage.py
+++ b/e2e/stress_test/continuous_failover_ha_multi_outage.py
@@ -1,5 +1,6 @@
 from utils.common_utils import sleep_n_sec
 from datetime import datetime
+from collections import defaultdict
 from stress_test.continuous_failover_ha_multi_client import RandomMultiClientFailoverTest
 from exceptions.custom_exception import LvolNotConnectException
 import threading
@@ -8,13 +9,20 @@
 import os
 
 
+generated_sequences = set()
+
 def generate_random_sequence(length):
     letters = string.ascii_uppercase
     numbers = string.digits
     all_chars = letters + numbers
-    first_char = random.choice(letters)
-    remaining_chars = ''.join(random.choices(all_chars, k=length - 1))
-    return first_char + remaining_chars
+
+    while True:
+        first_char = random.choice(letters)
+        remaining_chars = ''.join(random.choices(all_chars, k=length-1))
+        result = first_char + remaining_chars
+        if result not in generated_sequences:
+            generated_sequences.add(result)
+            return result
 
 
 class RandomMultiClientMultiFailoverTest(RandomMultiClientFailoverTest):
@@ -25,7 +33,7 @@ class RandomMultiClientMultiFailoverTest(RandomMultiClientFailoverTest):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.total_lvols = 20
+        self.total_lvols = 40
         self.lvol_name = f"lvl{generate_random_sequence(15)}"
         self.clone_name = f"cln{generate_random_sequence(15)}"
         self.snapshot_name = f"snap{generate_random_sequence(15)}"
@@ -48,9 +56,12 @@ def __init__(self, **kwargs):
         self.lvols_without_sec_connect = []
         self.test_name = "n_plus_k_failover_multi_client_ha"
         self.outage_types = [
+            "graceful_shutdown",
+            "interface_full_network_interrupt"
+        ]
+        self.outage_types2 = [
             "container_stop",
             "graceful_shutdown",
-            "interface_partial_network_interrupt",
             "interface_full_network_interrupt"
         ]
         self.blocked_ports = None
@@ -61,30 +72,101 @@ def _initialize_outage_log(self):
         with open(self.outage_log_file, 'w') as log:
             log.write("Timestamp,Node,Outage_Type,Event\n")
 
-    def log_outage_event(self, node, outage_type, event):
-        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    def log_outage_event(self, node, outage_type, event, outage_time=0):
+        """Log an outage event to the outage log file.
+
+        Args:
+            node (str): Node UUID or IP where the event occurred.
+            outage_type (str): Type of outage (e.g., port_network_interrupt, container_stop, graceful_shutdown).
+            event (str): Event description (e.g., 'Outage started', 'Node restarted').
+            outage_time (int): Minutes to add to self.outage_start_time. If 0/None, use current time.
+        """
+        # Compute timestamp
+        if outage_time:
+            # Uses self.outage_start_time (epoch seconds) + outage_time (minutes)
+            base_epoch = getattr(self, "outage_start_time", None)
+            if isinstance(base_epoch, (int, float)) and base_epoch > 0:
+                ts_dt = datetime.fromtimestamp(int(base_epoch) + int(outage_time) * 60)
+            else:
+                # Fallback to now if outage_start_time is missing/invalid
+                ts_dt = datetime.now()
+        else:
+            ts_dt = datetime.now()
+
+        timestamp = ts_dt.strftime('%Y-%m-%d %H:%M:%S')
+
+        # Write the log line
         with open(self.outage_log_file, 'a') as log:
             log.write(f"{timestamp},{node},{outage_type},{event}\n")
 
+    def _build_reverse_secondary_map(self):
+        rev = defaultdict(set)        # secondary -> {primary,...}
+        for p, s in self.sn_primary_secondary_map.items():
+            if s:
+                rev[s].add(p)
+        return rev
+
+    def _pick_outage_nodes(self, primary_candidates, k):
+        rev = self._build_reverse_secondary_map()
+        order = primary_candidates[:]
+
+        random.shuffle(order)
+
+        chosen, blocked = [], set()
+        for node in order:
+            if node in blocked:
+                continue
+
+            chosen.append(node)
+            blocked.add(node)                            # itself
+            sec = self.sn_primary_secondary_map.get(node)
+            if sec:
+                blocked.add(sec)                         # its secondary
+            blocked.update(rev.get(node, ()))           # any primary whose secondary == node
+
+            if len(chosen) == k:
+                break
+
+        if len(chosen) < k:
+            raise Exception(
+                f"Cannot pick {k} nodes without primary/secondary conflicts; only {len(chosen)} possible with current topology."
+            )
+        return chosen
+
     def perform_n_plus_k_outages(self):
         """
-        Perform K (self.npcs) parallel outages as part of N+K configuration.
-        Ensure only primary nodes are selected for outage.
+        Select K outage nodes such that no two are in a primary/secondary
+        relationship (in either direction). Candidates = keys of the map.
         """
-        primary_nodes = [node for node in self.sn_nodes if not self.sbcli_utils.is_secondary_node(node)]
+        # Candidates are nodes that are primary *for someone* (map keys)
+        primary_candidates = list(self.sn_primary_secondary_map.keys())
+        self.current_outage_nodes = []
 
-        if len(primary_nodes) < self.npcs:
-            raise Exception(f"Not enough primary nodes to perform {self.npcs} outages. Found only {len(primary_nodes)}.")
+        if len(primary_candidates) < self.npcs:
+            raise Exception(
+                f"Need {self.npcs} outage nodes, but only {len(primary_candidates)} primary-role nodes exist."
+            )
 
-        outage_nodes = random.sample(primary_nodes, k=self.npcs)
+        outage_nodes = self._pick_outage_nodes(primary_candidates, self.npcs)
+        self.logger.info(f"Selected outage nodes: {outage_nodes}")
         outage_combinations = []
-
+        outage_num = 0
         for node in outage_nodes:
-            outage_type = random.choice(self.outage_types)
+            if outage_num == 0:
+                outage_type = random.choice(self.outage_types)
+                outage_num = 1
+            else:
+                outage_type = random.choice(self.outage_types2)
             node_details = self.sbcli_utils.get_storage_node_details(node)
             node_ip = node_details[0]["mgmt_ip"]
             node_rpc_port = node_details[0]["rpc_port"]
 
+            self.ssh_obj.fetch_distrib_logs(
+                storage_node_ip=node_ip,
+                storage_node_id=node,
+                logs_path=self.docker_logs_path
+            )
+
             self.logger.info(f"Performing {outage_type} on primary node {node}.")
             self.log_outage_event(node, outage_type, "Outage started")
 
@@ -105,26 +187,74 @@ def perform_n_plus_k_outages(self):
 
     def _graceful_shutdown_node(self, node):
         try:
-            self.sbcli_utils.suspend_node(node_uuid=node, expected_error_code=[503])
-            self.sbcli_utils.wait_for_storage_node_status(node, "suspended", timeout=1000)
-            self.sbcli_utils.shutdown_node(node_uuid=node, expected_error_code=[503])
-            self.sbcli_utils.wait_for_storage_node_status(node, "offline", timeout=1000)
+            sleep_n_sec(10)
+            max_retries = 10
+            retry_delay = 10  # seconds
+            # Retry mechanism for suspending the node
+            for attempt in range(max_retries):
+                try:
+                    if attempt == max_retries - 1:
+                        self.logger.info("[CHECK] Suspending Node via CLI as via API Fails.")
+                        self.ssh_obj.suspend_node(node=self.mgmt_nodes[0],
+                                                  node_id=node)
+                    else:
+                        self.sbcli_utils.suspend_node(node_uuid=node, expected_error_code=[503])
+                    self.sbcli_utils.wait_for_storage_node_status(node, "suspended", timeout=1000)
+                    break  # Exit loop if successful
+                except Exception as _:
+                    if attempt < max_retries - 2:
+                        self.logger.info(f"Attempt {attempt + 1} failed to suspend node. Retrying in {retry_delay} seconds...")
+                        sleep_n_sec(retry_delay)
+                    elif attempt < max_retries - 1:
+                        self.logger.info(f"Attempt {attempt + 1} failed to suspend node via API. Retrying in {retry_delay} seconds via CMD...")
+                        sleep_n_sec(retry_delay)
+                    else:
+                        self.logger.info("Max retries reached. Failed to suspend node.")
+                        raise  # Rethrow the last exception
+
+            sleep_n_sec(10)  # Wait before shutting down
+
+            # Retry mechanism for shutting down the node
+            for attempt in range(max_retries):
+                try:
+                    if attempt == max_retries - 1:
+                        self.logger.info("[CHECK] Shutting down Node via CLI as via API Fails.")
+                        self.ssh_obj.shutdown_node(node=self.mgmt_nodes[0],
+                                                   node_id=node,
+                                                   force=True)
+                    else:
+                        self.sbcli_utils.shutdown_node(node_uuid=node, force=True,
+                                                       expected_error_code=[503])
+                    self.sbcli_utils.wait_for_storage_node_status(node, "offline", timeout=1000)
+                    break  # Exit loop if successful
+                except Exception as _:
+                    if attempt < max_retries - 2:
+                        self.logger.info(f"Attempt {attempt + 1} failed to shutdown node. Retrying in {retry_delay} seconds...")
+                        sleep_n_sec(retry_delay)
+                    elif attempt < max_retries - 1:
+                        self.logger.info(f"Attempt {attempt + 1} failed to shutdown node via API. Retrying in {retry_delay} seconds via CMD...")
+                        sleep_n_sec(retry_delay)
+                    else:
+                        self.logger.info("Max retries reached. Failed to shutdown node.")
+                        raise  # Rethrow the last exception
         except Exception as e:
             self.logger.error(f"Failed graceful shutdown for node {node}: {str(e)}")
 
     def _disconnect_partial_interface(self, node, node_ip):
         active_interfaces = [nic["if_name"] for nic in self.sbcli_utils.get_storage_node_details(node)[0]["data_nics"]]
+        active_interfaces = ['eth1']
         self.disconnect_thread = threading.Thread(
             target=self.ssh_obj.disconnect_all_active_interfaces,
-            args=(node_ip, active_interfaces, 600)
+            args=(node_ip, active_interfaces, 300)
         )
         self.disconnect_thread.start()
 
     def _disconnect_full_interface(self, node, node_ip):
+        self.logger.info("Handling full interface based network interruption...")
         active_interfaces = self.ssh_obj.get_active_interfaces(node_ip)
         self.disconnect_thread = threading.Thread(
             target=self.ssh_obj.disconnect_all_active_interfaces,
-            args=(node_ip, active_interfaces, 600)
+            args=(node_ip, active_interfaces, 300)
         )
         self.disconnect_thread.start()
 
@@ -134,50 +264,81 @@ def delete_random_lvols(self, count):
             lvol for node, lvols in self.node_vs_lvol.items()
             if node not in self.current_outage_nodes for lvol in lvols
         ]
+
+        self.logger.info(f"Available Lvols: {available_lvols}")
         if len(available_lvols) < count:
             self.logger.warning("Not enough lvols available to delete the requested count.")
             count = len(available_lvols)
 
         for lvol in random.sample(available_lvols, count):
-            self.logger.info(f"Deleting lvol {lvol}")
+            self.logger.info(f"Deleting lvol {lvol}.")
             snapshots = self.lvol_mount_details[lvol]["snapshots"]
             to_delete = []
-
-            # Handle dependent clones
             for clone_name, clone_details in self.clone_mount_details.items():
                 if clone_details["snapshot"] in snapshots:
-                    self.common_utils.validate_fio_test(clone_details["Client"], clone_details["Log"])
+                    self.common_utils.validate_fio_test(clone_details["Client"],
+                                                        log_file=clone_details["Log"])
                     self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=False)
                     fio_pids = self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=True)
+                    sleep_n_sec(10)
                     for pid in fio_pids:
                         self.ssh_obj.kill_processes(clone_details["Client"], pid=pid)
+                    attempt = 1
+                    while len(fio_pids) > 2:
+                        self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=False)
+                        fio_pids = self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=True)
+                        if attempt >= 30:
+                            raise Exception("FIO not killed on clone")
+                        attempt += 1
+                        sleep_n_sec(20)
+                    
+                    sleep_n_sec(10)
                     self.ssh_obj.unmount_path(clone_details["Client"], f"/mnt/{clone_name}")
                     self.ssh_obj.remove_dir(clone_details["Client"], dir_path=f"/mnt/{clone_name}")
                     self.disconnect_lvol(clone_details['ID'])
-                    self.sbcli_utils.delete_lvol(clone_name)
+                    self.sbcli_utils.delete_lvol(clone_name, max_attempt=20, skip_error=True)
+                    sleep_n_sec(30)
                     if clone_name in self.lvols_without_sec_connect:
                         self.lvols_without_sec_connect.remove(clone_name)
                     to_delete.append(clone_name)
-
+                    self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone_name}_fio*"])
+                    self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone_name}_fio_iolog*"])
+                    self.ssh_obj.delete_files(clone_details["Client"], [f"/mnt/{clone_name}/*"])
+                    # self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone_name}*.log"])
             for del_key in to_delete:
                 del self.clone_mount_details[del_key]
-
-            # Delete snapshots
             for snapshot in snapshots:
                 snapshot_id = self.ssh_obj.get_snapshot_id(self.mgmt_nodes[0], snapshot)
+                # snapshot_node = self.snap_vs_node[snapshot]
+                # if snapshot_node not in skip_nodes:
                 self.ssh_obj.delete_snapshot(self.mgmt_nodes[0], snapshot_id=snapshot_id)
                 self.snapshot_names.remove(snapshot)
 
-            # Stop FIO and cleanup lvol
-            self.common_utils.validate_fio_test(self.lvol_mount_details[lvol]["Client"], self.lvol_mount_details[lvol]["Log"])
+            self.common_utils.validate_fio_test(self.lvol_mount_details[lvol]["Client"],
+                                                log_file=self.lvol_mount_details[lvol]["Log"])
             self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=False)
+            sleep_n_sec(10)
             fio_pids = self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=True)
             for pid in fio_pids:
                 self.ssh_obj.kill_processes(self.lvol_mount_details[lvol]["Client"], pid=pid)
+            attempt = 1
+            while len(fio_pids) > 2:
+                self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=False)
+                fio_pids = self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=True)
+                if attempt >= 30:
+                    raise Exception("FIO not killed on lvols")
+                attempt += 1
+                sleep_n_sec(20)
+
+            sleep_n_sec(10)
             self.ssh_obj.unmount_path(self.lvol_mount_details[lvol]["Client"], f"/mnt/{lvol}")
             self.ssh_obj.remove_dir(self.lvol_mount_details[lvol]["Client"], dir_path=f"/mnt/{lvol}")
             self.disconnect_lvol(self.lvol_mount_details[lvol]['ID'])
-            self.sbcli_utils.delete_lvol(lvol)
+            self.sbcli_utils.delete_lvol(lvol, max_attempt=20, skip_error=True)
+            self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"{self.log_path}/local-{lvol}_fio*"])
+            self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"])
+            self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"/mnt/{lvol}/*"])
+            # self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"{self.log_path}/{lvol}*.log"])
             if lvol in self.lvols_without_sec_connect:
                 self.lvols_without_sec_connect.remove(lvol)
             del self.lvol_mount_details[lvol]
@@ -190,14 +351,19 @@ def delete_random_lvols(self, count):
     def create_snapshots_and_clones(self):
         """Create snapshots and clones during an outage, avoiding lvols on outage nodes."""
         self.int_lvol_size += 1
+        skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] in self.current_outage_nodes]
+        self.logger.info(f"Skip Nodes: {skip_nodes}")
+        for node in self.current_outage_nodes:
+            skip_nodes.append(node)
+        self.logger.info(f"Skip Nodes: {skip_nodes}")
         available_lvols = [
             lvol for node, lvols in self.node_vs_lvol.items()
-            if node not in self.current_outage_nodes for lvol in lvols
+            if node not in skip_nodes for lvol in lvols
         ]
         if not available_lvols:
             self.logger.warning("No available lvols to create snapshots and clones.")
             return
-
+        self.logger.info(f"Available lvols: {available_lvols}")
         for _ in range(3):
             random.shuffle(available_lvols)
             lvol = available_lvols[0]
@@ -205,69 +371,140 @@ def create_snapshots_and_clones(self):
             temp_name = generate_random_sequence(5)
             if snapshot_name in self.snapshot_names:
                 snapshot_name = f"{snapshot_name}_{temp_name}"
-
             try:
                 output, error = self.ssh_obj.add_snapshot(self.mgmt_nodes[0], self.lvol_mount_details[lvol]["ID"], snapshot_name)
-                if "(False," in output or "(False," in error:
-                    raise Exception(output or error)
+                if "(False," in output:
+                    raise Exception(output)
+                if "(False," in error:
+                    raise Exception(error)
             except Exception as e:
-                self.logger.warning(f"Snapshot creation failed: {e}")
-                continue
-
+                self.logger.warning(f"Snap creation fails with {str(e)}. Retrying with different name.")
+                try:
+                    snapshot_name = f"snap_{lvol}"
+                    temp_name = generate_random_sequence(5)
+                    snapshot_name = f"{snapshot_name}_{temp_name}"
+                    self.ssh_obj.add_snapshot(self.mgmt_nodes[0], self.lvol_mount_details[lvol]["ID"], snapshot_name)
+                except Exception as exp:
+                    self.logger.warning(f"Retry Snap creation fails with {str(exp)}.")
+                    continue
+                
             self.snapshot_names.append(snapshot_name)
+            lvol_node_id = self.sbcli_utils.get_lvol_details(
+                lvol_id=self.lvol_mount_details[lvol]["ID"])[0]["node_id"]
+            self.snap_vs_node[snapshot_name] = lvol_node_id
             self.lvol_mount_details[lvol]["snapshots"].append(snapshot_name)
-
             clone_name = f"clone_{generate_random_sequence(15)}"
+            if clone_name in list(self.clone_mount_details):
+                clone_name = f"{clone_name}_{temp_name}"
             sleep_n_sec(30)
             snapshot_id = self.ssh_obj.get_snapshot_id(self.mgmt_nodes[0], snapshot_name)
             try:
                 self.ssh_obj.add_clone(self.mgmt_nodes[0], snapshot_id, clone_name)
             except Exception as e:
-                self.logger.warning(f"Clone creation failed: {e}")
-                continue
-
+                self.logger.warning(f"Clone creation fails with {str(e)}. Retrying with different name.")
+                try:
+                    clone_name = f"clone_{generate_random_sequence(15)}"
+                    temp_name = generate_random_sequence(5)
+                    clone_name = f"{clone_name}_{temp_name}"
+                    self.ssh_obj.add_clone(self.mgmt_nodes[0], snapshot_id, clone_name)
+                except Exception as exp:
+                    self.logger.warning(f"Retry Clone creation fails with {str(exp)}.")
+                    continue
             fs_type = self.lvol_mount_details[lvol]["FS"]
             client = self.lvol_mount_details[lvol]["Client"]
             self.clone_mount_details[clone_name] = {
-                "ID": self.sbcli_utils.get_lvol_id(clone_name),
-                "Command": None,
-                "Mount": None,
-                "Device": None,
-                "MD5": None,
-                "FS": fs_type,
-                "Log": f"{self.log_path}/{clone_name}.log",
-                "snapshot": snapshot_name,
-                "Client": client
+                   "ID": self.sbcli_utils.get_lvol_id(clone_name),
+                   "Command": None,
+                   "Mount": None,
+                   "Device": None,
+                   "MD5": None,
+                   "FS": fs_type,
+                   "Log": f"{self.log_path}/{clone_name}.log",
+                   "snapshot": snapshot_name,
+                   "Client": client,
+                   "iolog_base_path": f"{self.log_path}/{clone_name}_fio_iolog"
             }
 
+            self.logger.info(f"Created clone {clone_name}.")
+
+            sleep_n_sec(3)
+
+            self.ssh_obj.exec_command(node=self.mgmt_nodes[0],
+                                      command=f"{self.base_cmd} lvol list")
+
             connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=clone_name)
             self.clone_mount_details[clone_name]["Command"] = connect_ls
+
+            # if self.secondary_outage:
+            #     connect_ls = [connect_ls[0]]
+            #     self.lvols_without_sec_connect.append(clone_name)
+
             initial_devices = self.ssh_obj.get_devices(node=client)
             for connect_str in connect_ls:
                 _, error = self.ssh_obj.exec_command(node=client, command=connect_str)
                 if error:
-                    self.logger.warning(f"Clone connect failed: {error}")
+                    lvol_details = self.sbcli_utils.get_lvol_details(lvol_id=self.clone_mount_details[clone_name]["ID"])
+                    nqn = lvol_details[0]["nqn"]
+                    self.ssh_obj.disconnect_nvme(node=client, nqn_grep=nqn)
+                    self.logger.info(f"Connecting clone {clone_name} has error: {error}. Disconnect all connections for that clone!!")
+                    self.sbcli_utils.delete_lvol(lvol_name=clone_name, max_attempt=20, skip_error=True)
+                    sleep_n_sec(30)
+                    del self.clone_mount_details[clone_name]
                     continue
 
+            sleep_n_sec(3)
             final_devices = self.ssh_obj.get_devices(node=client)
-            lvol_device = next((f"/dev/{d.strip()}" for d in final_devices if d not in initial_devices), None)
+            lvol_device = None
+            for device in final_devices:
+                if device not in initial_devices:
+                    lvol_device = f"/dev/{device.strip()}"
+                    break
             if not lvol_device:
-                raise LvolNotConnectException("Clone device not found")
+                raise LvolNotConnectException("LVOL did not connect")
             self.clone_mount_details[clone_name]["Device"] = lvol_device
 
+            # Mount and Run FIO
             if fs_type == "xfs":
                 self.ssh_obj.clone_mount_gen_uuid(client, lvol_device)
-
             mount_point = f"{self.mount_path}/{clone_name}"
             self.ssh_obj.mount_path(node=client, device=lvol_device, mount_path=mount_point)
             self.clone_mount_details[clone_name]["Mount"] = mount_point
 
+            # clone_node_id = self.sbcli_utils.get_lvol_details(
+            #     lvol_id=self.lvol_mount_details[clone_name]["ID"])[0]["node_id"]
+            
+            # self.node_vs_lvol[clone_node_id].append(clone_name)
+
+            sleep_n_sec(10)
+
             self.ssh_obj.delete_files(client, [f"{mount_point}/*fio*"])
             self.ssh_obj.delete_files(client, [f"{self.log_path}/local-{clone_name}_fio*"])
-
+            self.ssh_obj.delete_files(client, [f"{self.log_path}/{clone_name}_fio_iolog*"])
+
+            sleep_n_sec(5)
+
+            # Start FIO
+            # fio_thread = threading.Thread(
+            #     target=self.ssh_obj.run_fio_test,
+            #     args=(client, None, self.clone_mount_details[clone_name]["Mount"], self.clone_mount_details[clone_name]["Log"]),
+            #     kwargs={
+            #         "size": self.fio_size,
+            #         "name": f"{clone_name}_fio",
+            #         "rw": "randrw",
+            #         "bs": f"{2 ** random.randint(2, 7)}K",
+            #         "nrfiles": 16,
+            #         "iodepth": 1,
+            #         "numjobs": 5,
+            #         "time_based": True,
+            #         "runtime": 2000,
+            #         "log_avg_msec": 1000,
+            #         "iolog_file": self.clone_mount_details[clone_name]["iolog_base_path"],
+            #         "debug": True,
+            #     },
+            # )
             fio_thread = threading.Thread(
                 target=self.ssh_obj.run_fio_test,
-                args=(client, None, mount_point, self.clone_mount_details[clone_name]["Log"]),
+                args=(client, None, self.clone_mount_details[clone_name]["Mount"], self.clone_mount_details[clone_name]["Log"]),
                 kwargs={
                     "size": self.fio_size,
                     "name": f"{clone_name}_fio",
@@ -278,15 +515,21 @@ def create_snapshots_and_clones(self):
                     "numjobs": 5,
                     "time_based": True,
                     "runtime": 2000,
+                    "log_avg_msec": 1000,
+                    "iolog_file": self.clone_mount_details[clone_name]["iolog_base_path"],
                 },
             )
             fio_thread.start()
             self.fio_threads.append(fio_thread)
+            self.logger.info(f"Created snapshot {snapshot_name} and clone {clone_name}.")
 
-            self.logger.info(f"Created snapshot {snapshot_name} and clone {clone_name}")
-            self.sbcli_utils.resize_lvol(self.lvol_mount_details[lvol]["ID"], f"{self.int_lvol_size}G")
+            if self.lvol_mount_details[lvol]["ID"]:
+                self.sbcli_utils.resize_lvol(lvol_id=self.lvol_mount_details[lvol]["ID"],
+                                             new_size=f"{self.int_lvol_size}G")
             sleep_n_sec(10)
-            self.sbcli_utils.resize_lvol(self.clone_mount_details[clone_name]["ID"], f"{self.int_lvol_size}G")
+            if self.clone_mount_details[clone_name]["ID"]:
+                self.sbcli_utils.resize_lvol(lvol_id=self.clone_mount_details[clone_name]["ID"],
+                                             new_size=f"{self.int_lvol_size}G")
 
 
     def run(self):
@@ -301,6 +544,8 @@ def run(self):
         for result in storage_nodes['results']:
             self.sn_nodes.append(result["uuid"])
             self.sn_nodes_with_sec.append(result["uuid"])
+            self.sn_primary_secondary_map[result["uuid"]] = result["secondary_node_id"]
+        self.logger.info(f"Secondary node map: {self.sn_primary_secondary_map}")
 
         sleep_n_sec(30)
 
@@ -320,11 +565,23 @@ def run(self):
 
             for node, outage_type in outage_events:
                 self.current_outage_node = node
-                self.restart_nodes_after_failover(outage_type)
+                if outage_type == "container_stop" and self.npcs > 1:
+                    self.restart_nodes_after_failover(outage_type, True)
+                else:
+                    self.restart_nodes_after_failover(outage_type)
 
             self.logger.info("Waiting for fallback recovery.")
             sleep_n_sec(100)
 
+            for node in self.sn_nodes_with_sec:
+                cur_node_details = self.sbcli_utils.get_storage_node_details(node)
+                cur_node_ip = cur_node_details[0]["mgmt_ip"]
+                self.ssh_obj.fetch_distrib_logs(
+                    storage_node_ip=cur_node_ip,
+                    storage_node_id=node,
+                    logs_path=self.docker_logs_path
+                )
+
             time_duration = self.common_utils.calculate_time_duration(
                 start_timestamp=self.outage_start_time,
                 end_timestamp=self.outage_end_time
@@ -343,12 +600,27 @@ def run(self):
             # for node, outage_type in outage_events:
             #     if not self.sbcli_utils.is_secondary_node(node):
             self.validate_migration_for_node(self.outage_start_time, 2000, None, 60, no_task_ok=no_task_ok)
+            self.common_utils.manage_fio_threads(self.fio_node, self.fio_threads, timeout=20000)
 
             for clone, clone_details in self.clone_mount_details.items():
                 self.common_utils.validate_fio_test(clone_details["Client"], clone_details["Log"])
+                self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone}_fio*"])
+                self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone}_fio_iolog*"])
 
             for lvol, lvol_details in self.lvol_mount_details.items():
                 self.common_utils.validate_fio_test(lvol_details["Client"], lvol_details["Log"])
+                self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/local-{lvol}_fio*"])
+                self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"])
 
             self.logger.info(f"N+K failover iteration {iteration} complete.")
+
+            for node in self.sn_nodes_with_sec:
+                cur_node_details = self.sbcli_utils.get_storage_node_details(node)
+                cur_node_ip = cur_node_details[0]["mgmt_ip"]
+                self.ssh_obj.fetch_distrib_logs(
+                    storage_node_ip=cur_node_ip,
+                    storage_node_id=node,
+                    logs_path=self.docker_logs_path
+                )
             iteration += 1
+
diff --git a/e2e/utils/ssh_utils.py b/e2e/utils/ssh_utils.py
index bd06f06f7..ee265d507 100644
--- a/e2e/utils/ssh_utils.py
+++ b/e2e/utils/ssh_utils.py
@@ -13,6 +13,10 @@
 import string
 import re
 import subprocess
+import shlex
+import socket
+from collections import defaultdict
+from typing import Optional, List
 
 
 SSH_KEY_LOCATION = os.path.join(Path.home(), ".ssh", os.environ.get("KEY_NAME"))
@@ -47,31 +51,227 @@ def __init__(self, bastion_server):
         self.log_monitor_threads = {}
         self.log_monitor_stop_flags = {}
         self.ssh_semaphore = threading.Semaphore(10)  # Max 10 SSH calls in parallel (tune as needed)
+        self._bastion_client = None
+        self._reconnect_locks = defaultdict(threading.Lock)   
+        self.ssh_pass = None
+
+    def _candidate_usernames(self, explicit_user) -> List[str]:
+        if explicit_user:
+            if isinstance(explicit_user, (list, tuple)):
+                return list(explicit_user)
+            return [str(explicit_user)]
+        return ["ec2-user", "ubuntu", "rocky", "root"]
+    
+    def _load_private_keys(self) -> List[paramiko.PKey]:
+        """
+        Try Ed25519 then RSA. If SSH_KEY_LOCATION/env points to a file, use it.
+        Else try ~/.ssh/id_ed25519 and ~/.ssh/id_rsa. If SSH_KEY_PATH is a dir, load all files from it.
+        """
+        paths = []
+        # explicit single file via KEY_NAME → SSH_KEY_LOCATION
+        if SSH_KEY_LOCATION and os.path.isfile(SSH_KEY_LOCATION):
+            paths.append(SSH_KEY_LOCATION)
+        # defaults
+        home = os.path.join(Path.home(), ".ssh")
+        paths.extend([os.path.join(home, "id_ed25519"), os.path.join(home, "id_rsa")])
+
+        keys = []
+        seen = set()
+        for p in paths:
+            if not os.path.exists(p) or p in seen:
+                continue
+            seen.add(p)
+            try:
+                keys.append(paramiko.Ed25519Key.from_private_key_file(p))
+                continue
+            except Exception:
+                pass
+            try:
+                keys.append(paramiko.RSAKey.from_private_key_file(p))
+            except Exception:
+                pass
+        if not keys and not self.ssh_pass:
+            raise FileNotFoundError("No usable SSH private key found and SSH_PASS not set.")
+        return keys
+
+    def _try_connect(self, host: str, username: str, pkey: Optional[paramiko.PKey], password: Optional[str], sock=None, timeout=30):
+        cli = paramiko.SSHClient()
+        cli.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        cli.connect(
+            hostname=host,
+            username=username,
+            pkey=pkey,
+            password=(password if pkey is None else None),
+            timeout=timeout,
+            banner_timeout=timeout,
+            auth_timeout=timeout,
+            allow_agent=False,
+            look_for_keys=False,
+            sock=sock
+        )
+        return cli
+
+    # def connect(self, address: str, port: int = 22,
+    #         bastion_server_address: str = None,
+    #         username: str = "ec2-user",
+    #         is_bastion_server: bool = False):
+    #     """Connect to cluster nodes"""
+    #     # --- prep usernames list ---
+    #     default_users = ["ec2-user", "ubuntu", "rocky", "root"]
+    #     if getattr(self, "ssh_user", None):
+    #         if isinstance(self.ssh_user, (list, tuple)):
+    #             usernames = list(self.ssh_user)
+    #         else:
+    #             usernames = [str(self.ssh_user)]
+    #     else:
+    #         usernames = default_users
+
+    #     # Load key (Ed25519 -> RSA fallback)
+    #     if not os.path.exists(SSH_KEY_LOCATION):
+    #         raise FileNotFoundError(f"SSH private key not found at {SSH_KEY_LOCATION}")
+    #     try:
+    #         private_key = paramiko.Ed25519Key(filename=SSH_KEY_LOCATION)
+    #     except Exception:
+    #         private_key = paramiko.RSAKey.from_private_key_file(SSH_KEY_LOCATION)
+
+    #     # Helper to store/replace a connection
+    #     def _store(host, client):
+    #         if self.ssh_connections.get(host):
+    #             try:
+    #                 self.ssh_connections[host].close()
+    #             except Exception:
+    #                 pass
+    #         self.ssh_connections[host] = client
+
+    #     # ---------- direct connection ----------
+    #     bastion_server_address = bastion_server_address or self.bastion_server
+    #     if not bastion_server_address:
+    #         self.logger.info(f"Connecting directly to {address} on port {port}...")
+    #         last_err = None
+    #         for user in usernames:
+    #             ssh = paramiko.SSHClient()
+    #             ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    #             try:
+    #                 ssh.connect(
+    #                     hostname=address,
+    #                     username=user,
+    #                     port=port,
+    #                     pkey=private_key,
+    #                     timeout=300,
+    #                     banner_timeout=30,
+    #                     auth_timeout=30,
+    #                     allow_agent=False,
+    #                     look_for_keys=False,
+    #                 )
+    #                 self.logger.info(f"Connected directly to {address} as '{user}'.")
+    #                 _store(address, ssh)
+    #                 return
+    #             except Exception as e:
+    #                 last_err = e
+    #                 self.logger.info(f"Direct login failed for '{user}': {repr(e)}")
+    #                 try:
+    #                     ssh.close()
+    #                 except Exception:
+    #                     pass
+    #         raise Exception(f"All usernames failed for {address}. Last error: {repr(last_err)}")
+
+    #     # ---------- connect to bastion ----------
+    #     self.logger.info(f"Connecting to bastion server {bastion_server_address}...")
+    #     bastion_ssh = paramiko.SSHClient()
+    #     bastion_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    #     last_err = None
+    #     bastion_user_used = None
+    #     for b_user in usernames:
+    #         try:
+    #             bastion_ssh.connect(
+    #                 hostname=bastion_server_address,
+    #                 username=b_user,
+    #                 port=port,
+    #                 pkey=private_key,
+    #                 timeout=300,
+    #                 banner_timeout=30,
+    #                 auth_timeout=30,
+    #                 allow_agent=False,
+    #                 look_for_keys=False,
+    #             )
+    #             self.logger.info(f"Connected to bastion as '{b_user}'.")
+    #             _store(bastion_server_address, bastion_ssh)
+    #             bastion_user_used = b_user
+    #             break
+    #         except Exception as e:
+    #             last_err = e
+    #             self.logger.info(f"Bastion login failed for '{b_user}': {repr(e)}")
+    #     if bastion_user_used is None:
+    #         raise Exception(f"All usernames failed for bastion {bastion_server_address}. Last error: {repr(last_err)}")
+    #     if is_bastion_server:
+    #         return  # caller only needed bastion
+
+    #     # ---------- tunnel to target through bastion ----------
+    #     self.logger.info(f"Connecting to target server {address} through bastion server...")
+    #     transport = bastion_ssh.get_transport()
+    #     last_err = None
+    #     for user in usernames:
+    #         # IMPORTANT: open a NEW channel for each username attempt
+    #         try:
+    #             channel = transport.open_channel(
+    #                 "direct-tcpip",
+    #                 (address, port),
+    #                 ("localhost", 0),
+    #             )
+    #         except paramiko.ssh_exception.ChannelException as ce:
+    #             self.logger.error(
+    #                 f"Channel open failed: {repr(ce)} — check AllowTcpForwarding/PermitOpen on bastion."
+    #             )
+    #             raise
+    #         target_ssh = paramiko.SSHClient()
+    #         target_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    #         try:
+    #             target_ssh.connect(
+    #                 address,
+    #                 username=user,
+    #                 port=port,
+    #                 sock=channel,
+    #                 pkey=private_key,
+    #                 timeout=300,
+    #                 banner_timeout=30,
+    #                 auth_timeout=30,
+    #                 allow_agent=False,
+    #                 look_for_keys=False,
+    #             )
+    #             self.logger.info(f"Connected to {address} as '{user}' via bastion '{bastion_user_used}'.")
+    #             _store(address, target_ssh)
+    #             return
+    #         except Exception as e:
+    #             last_err = e
+    #             self.logger.info(f"Target login failed for '{user}': {repr(e)}")
+    #             try:
+    #                 target_ssh.close()
+    #             except Exception:
+    #                 pass
+    #             try:
+    #                 channel.close()
+    #             except Exception:
+    #                 pass
+
+    #     raise Exception(
+    #         f"Tunnel established, but all usernames failed for target {address}. Last error: {repr(last_err)}"
+    #     )
 
     def connect(self, address: str, port: int = 22,
             bastion_server_address: str = None,
             username: str = "ec2-user",
             is_bastion_server: bool = False):
-        """Connect to cluster nodes"""
-        # --- prep usernames list ---
-        default_users = ["ec2-user", "ubuntu", "rocky", "root"]
-        if getattr(self, "ssh_user", None):
-            if isinstance(self.ssh_user, (list, tuple)):
-                usernames = list(self.ssh_user)
-            else:
-                usernames = [str(self.ssh_user)]
-        else:
-            usernames = default_users
+        """
+        Connect to a host directly or via bastion, trying multiple usernames and keys,
+        with optional password fallback.
+        """
+        # Resolve bastion
+        bastion_server_address = bastion_server_address or self.bastion_server
 
-        # Load key (Ed25519 -> RSA fallback)
-        if not os.path.exists(SSH_KEY_LOCATION):
-            raise FileNotFoundError(f"SSH private key not found at {SSH_KEY_LOCATION}")
-        try:
-            private_key = paramiko.Ed25519Key(filename=SSH_KEY_LOCATION)
-        except Exception:
-            private_key = paramiko.RSAKey.from_private_key_file(SSH_KEY_LOCATION)
+        usernames = self._candidate_usernames(self.ssh_user or username)
+        keys = self._load_private_keys()
+        password = self.ssh_pass
 
-        # Helper to store/replace a connection
         def _store(host, client):
             if self.ssh_connections.get(host):
                 try:
@@ -80,230 +280,291 @@ def _store(host, client):
                     pass
             self.ssh_connections[host] = client
 
-        # ---------- direct connection ----------
-        bastion_server_address = bastion_server_address or self.bastion_server
+        # --- NO BASTION: direct connect ---
         if not bastion_server_address:
-            self.logger.info(f"Connecting directly to {address} on port {port}...")
             last_err = None
+            self.logger.info(f"Connecting directly to {address} on port {port}...")
             for user in usernames:
-                ssh = paramiko.SSHClient()
-                ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-                try:
-                    ssh.connect(
-                        hostname=address,
-                        username=user,
-                        port=port,
-                        pkey=private_key,
-                        timeout=300,
-                        banner_timeout=30,
-                        auth_timeout=30,
-                        allow_agent=False,
-                        look_for_keys=False,
-                    )
-                    self.logger.info(f"Connected directly to {address} as '{user}'.")
-                    _store(address, ssh)
-                    return
-                except Exception as e:
-                    last_err = e
-                    self.logger.info(f"Direct login failed for '{user}': {repr(e)}")
+                # try keys
+                for key in keys:
                     try:
-                        ssh.close()
-                    except Exception:
-                        pass
+                        cli = self._try_connect(address, user, key, None, timeout=30)
+                        self.logger.info(f"Connected directly to {address} as '{user}'.")
+                        _store(address, cli)
+                        return
+                    except Exception as e:
+                        last_err = e
+                # then password
+                if password:
+                    try:
+                        cli = self._try_connect(address, user, None, password, timeout=30)
+                        self.logger.info(f"Connected directly to {address} as '{user}' (password).")
+                        _store(address, cli)
+                        return
+                    except Exception as e:
+                        last_err = e
             raise Exception(f"All usernames failed for {address}. Last error: {repr(last_err)}")
 
-        # ---------- connect to bastion ----------
-        self.logger.info(f"Connecting to bastion server {bastion_server_address}...")
-        bastion_ssh = paramiko.SSHClient()
-        bastion_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-        last_err = None
-        bastion_user_used = None
-        for b_user in usernames:
-            try:
-                bastion_ssh.connect(
-                    hostname=bastion_server_address,
-                    username=b_user,
-                    port=port,
-                    pkey=private_key,
-                    timeout=300,
-                    banner_timeout=30,
-                    auth_timeout=30,
-                    allow_agent=False,
-                    look_for_keys=False,
-                )
-                self.logger.info(f"Connected to bastion as '{b_user}'.")
-                _store(bastion_server_address, bastion_ssh)
-                bastion_user_used = b_user
+        # --- VIA BASTION ---
+        # ensure bastion client (reuse if alive)
+        if (not self._bastion_client) or (not self._bastion_client.get_transport()) or (not self._bastion_client.get_transport().is_active()):
+            last_err = None
+            self.logger.info(f"Connecting to bastion server {bastion_server_address}...")
+            for b_user in self._candidate_usernames(self.ssh_user or username):
+                for key in keys:
+                    try:
+                        cli = self._try_connect(bastion_server_address, b_user, key, None, timeout=30)
+                        self._bastion_client = cli
+                        self.logger.info(f"Connected to bastion as '{b_user}'.")
+                        break
+                    except Exception as e:
+                        last_err = e
+                else:
+                    if password:
+                        try:
+                            cli = self._try_connect(bastion_server_address, b_user, None, password, timeout=30)
+                            self._bastion_client = cli
+                            self.logger.info(f"Connected to bastion as '{b_user}' (password).")
+                            break
+                        except Exception as e:
+                            last_err = e
+                    continue
                 break
-            except Exception as e:
-                last_err = e
-                self.logger.info(f"Bastion login failed for '{b_user}': {repr(e)}")
-        if bastion_user_used is None:
-            raise Exception(f"All usernames failed for bastion {bastion_server_address}. Last error: {repr(last_err)}")
+            if (not self._bastion_client) or (not self._bastion_client.get_transport()) or (not self._bastion_client.get_transport().is_active()):
+                raise Exception(f"All usernames failed for bastion {bastion_server_address}. Last error: {repr(last_err)}")
+
         if is_bastion_server:
-            return  # caller only needed bastion
+            # caller only wanted bastion connection open
+            _store(bastion_server_address, self._bastion_client)
+            return
 
-        # ---------- tunnel to target through bastion ----------
+        # open a channel through bastion → target
         self.logger.info(f"Connecting to target server {address} through bastion server...")
-        transport = bastion_ssh.get_transport()
+        bastion_transport = self._bastion_client.get_transport()
+
         last_err = None
         for user in usernames:
-            # IMPORTANT: open a NEW channel for each username attempt
-            try:
-                channel = transport.open_channel(
-                    "direct-tcpip",
-                    (address, port),
-                    ("localhost", 0),
-                )
-            except paramiko.ssh_exception.ChannelException as ce:
-                self.logger.error(
-                    f"Channel open failed: {repr(ce)} — check AllowTcpForwarding/PermitOpen on bastion."
-                )
-                raise
-            target_ssh = paramiko.SSHClient()
-            target_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-            try:
-                target_ssh.connect(
-                    address,
-                    username=user,
-                    port=port,
-                    sock=channel,
-                    pkey=private_key,
-                    timeout=300,
-                    banner_timeout=30,
-                    auth_timeout=30,
-                    allow_agent=False,
-                    look_for_keys=False,
-                )
-                self.logger.info(f"Connected to {address} as '{user}' via bastion '{bastion_user_used}'.")
-                _store(address, target_ssh)
-                return
-            except Exception as e:
-                last_err = e
-                self.logger.info(f"Target login failed for '{user}': {repr(e)}")
+            # new channel for each attempt
+            chan = bastion_transport.open_channel("direct-tcpip", (address, port), ("127.0.0.1", 0))
+            # try keys
+            for key in keys:
                 try:
-                    target_ssh.close()
-                except Exception:
-                    pass
+                    cli = self._try_connect(address, user, key, None, sock=chan, timeout=30)
+                    self.logger.info(f"Connected to {address} as '{user}' via bastion.")
+                    _store(address, cli)
+                    return
+                except Exception as e:
+                    last_err = e
+            # then password
+            if password:
                 try:
-                    channel.close()
-                except Exception:
-                    pass
-
-        raise Exception(
-            f"Tunnel established, but all usernames failed for target {address}. Last error: {repr(last_err)}"
-        )
-
+                    cli = self._try_connect(address, user, None, password, sock=chan, timeout=30)
+                    self.logger.info(f"Connected to {address} as '{user}' via bastion (password).")
+                    _store(address, cli)
+                    return
+                except Exception as e:
+                    last_err = e
+            try:
+                chan.close()
+            except Exception:
+                pass
+
+        raise Exception(f"Tunnel established, but all usernames failed for target {address}. Last error: {repr(last_err)}")
+
+
+
+    # def exec_command(self, node, command, timeout=360, max_retries=3, stream_callback=None, supress_logs=False):
+    #     """Executes a command on a given machine with streaming output and retry mechanism.
+
+    #     Args:
+    #         node (str): Machine to run command on.
+    #         command (str): Command to run.
+    #         timeout (int): Timeout in seconds.
+    #         max_retries (int): Number of retries in case of failures.
+    #         stream_callback (callable, optional): A callback function for streaming output. Defaults to None.
+
+    #     Returns:
+    #         tuple: Final output and error strings after command execution.
+    #     """
+    #     retry_count = 0
+    #     while retry_count < max_retries:
+    #         with self.ssh_semaphore:
+    #             ssh_connection = self.ssh_connections.get(node)
+    #             try:
+    #                 # Ensure the SSH connection is active, otherwise reconnect
+    #                 if not ssh_connection or not ssh_connection.get_transport().is_active() or retry_count > 0:
+    #                     self.logger.info(f"Reconnecting SSH to node {node}")
+    #                     self.connect(
+    #                         address=node,
+    #                         is_bastion_server=True if node == self.bastion_server else False
+    #                     )
+    #                     ssh_connection = self.ssh_connections[node]
+                    
+    #                 if not supress_logs:
+    #                     self.logger.info(f"Executing command: {command}")
+    #                 stdin, stdout, stderr = ssh_connection.exec_command(command, timeout=timeout)
+
+    #                 output = []
+    #                 error = []
+
+    #                 # Read stdout and stderr dynamically if stream_callback is provided
+    #                 if stream_callback:
+    #                     while not stdout.channel.exit_status_ready():
+    #                         # Process stdout
+    #                         if stdout.channel.recv_ready():
+    #                             chunk = stdout.channel.recv(1024).decode()
+    #                             output.append(chunk)
+    #                             stream_callback(chunk, is_error=False)  # Callback for stdout
+
+    #                         # Process stderr
+    #                         if stderr.channel.recv_stderr_ready():
+    #                             chunk = stderr.channel.recv_stderr(1024).decode()
+    #                             error.append(chunk)
+    #                             stream_callback(chunk, is_error=True)  # Callback for stderr
+
+    #                         time.sleep(0.1)
+
+    #                     # Finalize any remaining output
+    #                     if stdout.channel.recv_ready():
+    #                         chunk = stdout.channel.recv(1024).decode()
+    #                         output.append(chunk)
+    #                         stream_callback(chunk, is_error=False)
+
+    #                     if stderr.channel.recv_stderr_ready():
+    #                         chunk = stderr.channel.recv_stderr(1024).decode()
+    #                         error.append(chunk)
+    #                         stream_callback(chunk, is_error=True)
+    #                 else:
+    #                     # Default behavior: Read the entire output at once
+    #                     output = stdout.read().decode()
+    #                     error = stderr.read().decode()
+
+    #                 # Combine the output into strings
+    #                 output = "".join(output) if isinstance(output, list) else output
+    #                 error = "".join(error) if isinstance(error, list) else error
+
+    #                 # Log the results
+    #                 if output:
+    #                     if not supress_logs:
+    #                         self.logger.info(f"Command output: {output}")
+    #                 if error:
+    #                     if not supress_logs:
+    #                         self.logger.error(f"Command error: {error}")
+
+    #                 if not output and not error:
+    #                     if not supress_logs:
+    #                         self.logger.warning(f"Command '{command}' executed but returned no output or error.")
+
+    #                 return output, error
+
+    #             except EOFError as e:
+    #                 self.logger.error(f"EOFError occurred while executing command '{command}': {e}. Retrying ({retry_count + 1}/{max_retries})...")
+    #                 retry_count += 1
+    #                 time.sleep(2)  # Short delay before retrying
+
+    #             except paramiko.SSHException as e:
+    #                 self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...")
+    #                 retry_count += 1
+    #                 time.sleep(2)  # Short delay before retrying
+
+    #             except paramiko.buffered_pipe.PipeTimeout as e:
+    #                 self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...")
+    #                 retry_count += 1
+    #                 time.sleep(2)  # Short delay before retrying
+
+    #             except Exception as e:
+    #                 self.logger.error(f"SSH command failed (General Exception): {e}. Retrying ({retry_count + 1}/{max_retries})...")
+    #                 retry_count += 1
+    #                 time.sleep(2)  # Short delay before retrying
+
+    #     # If we exhaust retries, return failure
+    #     self.logger.error(f"Failed to execute command '{command}' on node {node} after {max_retries} retries.")
+    #     return "", "Command failed after max retries"
 
     def exec_command(self, node, command, timeout=360, max_retries=3, stream_callback=None, supress_logs=False):
-        """Executes a command on a given machine with streaming output and retry mechanism.
-
-        Args:
-            node (str): Machine to run command on.
-            command (str): Command to run.
-            timeout (int): Timeout in seconds.
-            max_retries (int): Number of retries in case of failures.
-            stream_callback (callable, optional): A callback function for streaming output. Defaults to None.
-
-        Returns:
-            tuple: Final output and error strings after command execution.
         """
-        retry_count = 0
-        while retry_count < max_retries:
+        Execute a command with auto-reconnect (serialized per node), optional streaming,
+        and proper exit-status capture to reduce “ran but no output” confusion.
+        """
+        retry = 0
+        while retry < max_retries:
             with self.ssh_semaphore:
-                ssh_connection = self.ssh_connections.get(node)
+                # serialize reconnect attempts per node
+                lock = self._reconnect_locks[node]
+                with lock:
+                    ssh = self.ssh_connections.get(node)
+                    if not ssh or not ssh.get_transport() or not ssh.get_transport().is_active() or retry > 0:
+                        if not supress_logs:
+                            self.logger.info(f"Reconnecting SSH to node {node}")
+                        # if node is the bastion itself
+                        self.connect(node, is_bastion_server=(node == self.bastion_server))
+                        ssh = self.ssh_connections[node]
+
                 try:
-                    # Ensure the SSH connection is active, otherwise reconnect
-                    if not ssh_connection or not ssh_connection.get_transport().is_active() or retry_count > 0:
-                        self.logger.info(f"Reconnecting SSH to node {node}")
-                        self.connect(
-                            address=node,
-                            is_bastion_server=True if node == self.bastion_server else False
-                        )
-                        ssh_connection = self.ssh_connections[node]
-                    
                     if not supress_logs:
                         self.logger.info(f"Executing command: {command}")
-                    stdin, stdout, stderr = ssh_connection.exec_command(command, timeout=timeout)
+                    stdin, stdout, stderr = ssh.exec_command(command, timeout=timeout)
+                    output_chunks, error_chunks = [], []
 
-                    output = []
-                    error = []
-
-                    # Read stdout and stderr dynamically if stream_callback is provided
                     if stream_callback:
                         while not stdout.channel.exit_status_ready():
-                            # Process stdout
                             if stdout.channel.recv_ready():
-                                chunk = stdout.channel.recv(1024).decode()
-                                output.append(chunk)
-                                stream_callback(chunk, is_error=False)  # Callback for stdout
-
-                            # Process stderr
+                                chunk = stdout.channel.recv(8192).decode(errors="replace")
+                                output_chunks.append(chunk)
+                                stream_callback(chunk, is_error=False)
                             if stderr.channel.recv_stderr_ready():
-                                chunk = stderr.channel.recv_stderr(1024).decode()
-                                error.append(chunk)
-                                stream_callback(chunk, is_error=True)  # Callback for stderr
-
-                            time.sleep(0.1)
-
-                        # Finalize any remaining output
-                        if stdout.channel.recv_ready():
-                            chunk = stdout.channel.recv(1024).decode()
-                            output.append(chunk)
+                                chunk = stderr.channel.recv_stderr(8192).decode(errors="replace")
+                                error_chunks.append(chunk)
+                                stream_callback(chunk, is_error=True)
+                            time.sleep(0.05)
+
+                        # flush remaining
+                        while stdout.channel.recv_ready():
+                            chunk = stdout.channel.recv(8192).decode(errors="replace")
+                            output_chunks.append(chunk)
                             stream_callback(chunk, is_error=False)
-
-                        if stderr.channel.recv_stderr_ready():
-                            chunk = stderr.channel.recv_stderr(1024).decode()
-                            error.append(chunk)
+                        while stderr.channel.recv_stderr_ready():
+                            chunk = stderr.channel.recv_stderr(8192).decode(errors="replace")
+                            error_chunks.append(chunk)
                             stream_callback(chunk, is_error=True)
+
+                        exit_status = stdout.channel.recv_exit_status()
+                        out = "".join(output_chunks)
+                        err = "".join(error_chunks)
                     else:
-                        # Default behavior: Read the entire output at once
-                        output = stdout.read().decode()
-                        error = stderr.read().decode()
+                        out = stdout.read().decode(errors="replace")
+                        err = stderr.read().decode(errors="replace")
+                        exit_status = stdout.channel.recv_exit_status()
 
-                    # Combine the output into strings
-                    output = "".join(output) if isinstance(output, list) else output
-                    error = "".join(error) if isinstance(error, list) else error
+                    if (not supress_logs) and out:
+                        self.logger.info(f"Command output: {out.strip()[:2000]}")
+                    if (not supress_logs) and err:
+                        self.logger.error(f"Command error: {err.strip()[:2000]}")
 
-                    # Log the results
-                    if output:
-                        if not supress_logs:
-                            self.logger.info(f"Command output: {output}")
-                    if error:
-                        if not supress_logs:
-                            self.logger.error(f"Command error: {error}")
+                    if exit_status != 0 and not err:
+                        # some tools write nothing on stderr but non-zero exit
+                        err = f"Non-zero exit status: {exit_status}"
 
-                    if not output and not error:
+                    if not out and not err:
                         if not supress_logs:
                             self.logger.warning(f"Command '{command}' executed but returned no output or error.")
 
-                    return output, error
-
-                except EOFError as e:
-                    self.logger.error(f"EOFError occurred while executing command '{command}': {e}. Retrying ({retry_count + 1}/{max_retries})...")
-                    retry_count += 1
-                    time.sleep(2)  # Short delay before retrying
+                    return out, err
 
-                except paramiko.SSHException as e:
-                    self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...")
-                    retry_count += 1
-                    time.sleep(2)  # Short delay before retrying
-
-                except paramiko.buffered_pipe.PipeTimeout as e:
-                    self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...")
-                    retry_count += 1
-                    time.sleep(2)  # Short delay before retrying
+                except (EOFError, paramiko.SSHException, paramiko.buffered_pipe.PipeTimeout, socket.error) as e:
+                    retry += 1
+                    self.logger.error(f"SSH command failed ({type(e).__name__}): {e}. Retrying ({retry}/{max_retries})...")
+                    time.sleep(min(2 * retry, 5))
 
                 except Exception as e:
-                    self.logger.error(f"SSH command failed (General Exception): {e}. Retrying ({retry_count + 1}/{max_retries})...")
-                    retry_count += 1
-                    time.sleep(2)  # Short delay before retrying
+                    retry += 1
+                    self.logger.error(f"SSH command failed (General): {e}. Retrying ({retry}/{max_retries})...")
+                    time.sleep(min(2 * retry, 5))
 
-        # If we exhaust retries, return failure
         self.logger.error(f"Failed to execute command '{command}' on node {node} after {max_retries} retries.")
         return "", "Command failed after max retries"
 
-    
+
     def format_disk(self, node, device, fs_type="ext4"):
         """Format disk on the given node
 
@@ -362,14 +623,133 @@ def get_devices(self, node):
 
         return output.strip().split()
     
-    def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwargs):
-        """Run FIO Tests with given params and proper logging for MD5 error timestamp tracing.
+    # def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwargs):
+    #     """
+    #     Run FIO with optional 'ensure_running' that verifies process presence and retries start  up to N times.
+
+    #     kwargs:
+    #     - ensure_running: bool (default False)
+    #     - max_start_retries: int (default 3)
+    #     """
+    #     location = ""
+    #     if device:
+    #         location = f"--filename={device}"
+    #     if directory:
+    #         location = f"--directory={directory}"
+
+    #     runtime     = kwargs.get("runtime", 3600)
+    #     name        = kwargs.get("name", f"fio_{_rid(6)}")
+    #     ioengine    = kwargs.get("ioengine", "libaio")
+    #     iodepth     = kwargs.get("iodepth", 1)
+    #     time_based  = "--time_based" if kwargs.get("time_based", True) else ""
+    #     rw          = kwargs.get("rw", "randrw")
+    #     bs          = kwargs.get("bs", "4K")
+    #     size        = kwargs.get("size", "1G")
+    #     rwmixread   = kwargs.get("rwmixread", 70)
+    #     numjobs     = kwargs.get("numjobs", 2)
+    #     nrfiles     = kwargs.get("nrfiles", 8)
+    #     log_avg_ms  = kwargs.get("log_avg_msec", 1000)
+    #     output_fmt  = f' --output-format={kwargs["output_format"]} ' if kwargs.get("output_format") else ''
+    #     output_file = f" --output={kwargs['output_file']} " if kwargs.get("output_file") else ''
+    #     iolog_base  = kwargs.get("iolog_file")
+
+    #     iolog_opt   = f"--write_iolog={iolog_base}" if iolog_base else ""
+    #     log_opt     = f"--log_avg_msec={log_avg_ms}" if log_avg_ms else ""
+
+    #     command = (
+    #         f"sudo fio --name={name} {location} --ioengine={ioengine} --direct=1 --iodepth={iodepth} "
+    #         f"{time_based} --runtime={runtime} --rw={rw} --max_latency=20s --bs={bs} --size={size} --rwmixread={rwmixread} "
+    #         f"--verify=md5 --verify_dump=1 --verify_fatal=1 --numjobs={numjobs} --nrfiles={nrfiles} "
+    #         f"{log_opt} {iolog_opt} {output_fmt}{output_file}"
+    #     )
+    #     if kwargs.get("debug"):
+    #         command += " --debug=all"
+    #     if log_file:
+    #         command += f" > {log_file} 2>&1"
+
+    #     ensure_running   = bool(kwargs.get("ensure_running", False))
+    #     max_start_retries = int(kwargs.get("max_start_retries", 3))
+
+    #     launch_retries = 3
+    #     for attempt in range(1, launch_retries + 1):
+
+    #         try:
+    #             self.logger.info(f"Starting FIO on {node}: {name} → {location} (attempt {attempt}/{launch_retries})")
+    #             self.exec_command(node=node, command=f"sudo {command}", max_retries=2)
+    #             break
+    #         except Exception as e:
+    #             self.logger.error(f"FIO start failed: {e}")
+    #             if attempt == launch_retries:
+    #                 raise
+    #             time.sleep(1.0 * attempt)
+
+    #     # Ensure process is up (pgrep name)
+    #     start_retries = 6
+    #     for i in range(start_retries):
+    #         out, err  = self.exec_command(
+    #             node=node,
+    #             command=f"pgrep -fa 'fio.*{name}' || true",
+    #             max_retries=1,
+    #         )
+    #         if out.strip():
+    #             self.logger.info(f"FIO is running for {name}: {out.strip().splitlines()[0]}")
+    #             return
+    #         # Not running yet → small backoff and try again
+    #         time.sleep(2 + i)
+    #         # If still not, try re-launch quickly
+    #         if i >= 2:
+    #             self.logger.warning(f"FIO still not running for {name}; re-issuing start (try {i-1}/{start_retries-3})")
+    #             try:
+    #                 self.exec_command(node=node, command=f"sudo {command}", max_retries=1)
+    #             except Exception as e:
+    #                 self.logger.warning(f"Re-start attempt raised: {e}")
+
+    #     # If we get here, fio didn’t stick
+    #     raise RuntimeError(f"FIO failed to stay running for job {name} on {node}")
+
+        # def _is_running():
+        #     # Use pgrep on job name (fio --name=<name>) for a quick check
+        #     # Fall back to ps+grep if pgrep not present.
+        #     try:
+        #         out, _ = self.exec_command(node=node, command=f"pgrep -fl 'fio.*--name={name}'", max_retries=1)
+        #         return bool(out.strip())
+        #     except Exception:
+        #         out, _ = self.exec_command(node=node, command=f"ps ax | grep -E 'fio.*--name={name}' | grep -v grep || true", max_retries=1)
+        #         return bool(out.strip())
+
+        # # Try to start; handle EOF/channel close by reconnect+retry
+        # attempts = 0
+        # while True:
+        #     attempts += 1
+        #     try:
+        #         self.exec_command(node=node, command=command, max_retries=3)
+        #     except Exception as e:
+        #         # Channel/EOF during start is common in churn; retry a few times
+        #         if attempts < max_start_retries:
+        #             self.logger.error(f"FIO start error ({e}); retrying {attempts}/{max_start_retries} in 2s")
+        #             time.sleep(2)
+        #             continue
+        #         else:
+        #             raise
+
+        #     if not ensure_running:
+        #         return
+
+        #     # Verify started; retry if not
+        #     time.sleep(1.0)
+        #     if _is_running():
+        #         return
+
+        #     if attempts >= max_start_retries:
+        #         raise RuntimeError(f"FIO failed to start after {max_start_retries} attempts for job '{name}'")
+
+        #     self.logger.warning(f"FIO not detected running for '{name}'; retrying start {attempts}/{max_start_retries}")
+        #     time.sleep(1.0)
 
-        Args:
-            node (str): Node to perform ssh operation on
-            device (str): Device path. Defaults to None.
-            directory (str, optional): Directory to run test on. Defaults to None.
-            log_file (str, optional): Log file to redirect output to. Defaults to None.
+    def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwargs):
+        """
+        Start FIO in a detached tmux session so it survives SSH channel drops during fast outages.
+        Verifies process presence and re-kicks a few times if missing.
         """
         location = ""
         if device:
@@ -377,72 +757,63 @@ def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwarg
         if directory:
             location = f"--directory={directory}"
 
-        runtime = kwargs.get("runtime", 3600)
-        rw = kwargs.get("rw", "randrw")
-        name = kwargs.get("name", "test")
-        ioengine = kwargs.get("ioengine", "libaio")
-        iodepth = kwargs.get("iodepth", 1)
-        bs = kwargs.get("bs", "4k")
-        rwmixread = kwargs.get("rwmixread", 70)
-        size = kwargs.get("size", "10MiB")
-        time_based = "--time_based" if kwargs.get("time_based", True) else ""
-        numjobs = kwargs.get("numjobs", 1)
-        nrfiles = kwargs.get("nrfiles", 1)
-
-        output_format = f' --output-format={kwargs["output_format"]} ' if kwargs.get("output_format") else ''
+        runtime     = kwargs.get("runtime", 3600)
+        name        = kwargs.get("name", f"fio_{_rid(6)}")
+        ioengine    = kwargs.get("ioengine", "libaio")
+        iodepth     = kwargs.get("iodepth", 1)
+        time_based  = "--time_based" if kwargs.get("time_based", True) else ""
+        rw          = kwargs.get("rw", "randrw")
+        bs          = kwargs.get("bs", "4K")
+        size        = kwargs.get("size", "1G")
+        rwmixread   = kwargs.get("rwmixread", 70)
+        numjobs     = kwargs.get("numjobs", 2)
+        nrfiles     = kwargs.get("nrfiles", 8)
+        log_avg_ms  = kwargs.get("log_avg_msec", 1000)
+        max_latency  = kwargs.get("max_latency", "20s")
+        use_latency = kwargs.get("use_latency", True)
+        output_fmt  = f' --output-format={kwargs["output_format"]} ' if kwargs.get("output_format") else ''
         output_file = f" --output={kwargs['output_file']} " if kwargs.get("output_file") else ''
+        iolog_base  = kwargs.get("iolog_file")
 
-        log_avg_msec = kwargs.get("log_avg_msec", 1000)
-        log_avg_msec_opt = f"--log_avg_msec={log_avg_msec}" if log_avg_msec else ""
-
-        iolog_base = kwargs.get("iolog_file", None)
-        iolog_opt = f"--write_iolog={iolog_base}" if iolog_base else ""
-        verify_md5 = "--verify=md5" if iodepth == 1 else ""
+        iolog_opt   = f"--write_iolog={iolog_base}" if iolog_base else ""
+        log_opt     = f"--log_avg_msec={log_avg_ms}" if log_avg_ms else ""
+        latency = f" --max_latency={max_latency}" if use_latency else ""
 
-        command = (
-            f"sudo fio --name={name} {location} --ioengine={ioengine} --direct=1 --iodepth={iodepth} "
-            f"{time_based} --runtime={runtime} --rw={rw} --max_latency=30s --bs={bs} --size={size} --rwmixread={rwmixread} "
-            f"{verify_md5} --verify_dump=1 --verify_fatal=1 --numjobs={numjobs} --nrfiles={nrfiles} "
-            f"{log_avg_msec_opt} {iolog_opt} "
-            f"{output_format}{output_file}"
-        )
-        # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        # log_file = log_file or f"/tmp/{name}_{timestamp}.log"
+        # raw fio command
+        fio_cmd = (
+            f"fio --name={name} {location} --ioengine={ioengine} --direct=1 --iodepth={iodepth} "
+            f"{time_based} --runtime={runtime} --rw={rw} {latency} --bs={bs} --size={size} --rwmixread={rwmixread} "
+            f"--verify=md5 --verify_dump=1 --verify_fatal=1 --numjobs={numjobs} --nrfiles={nrfiles} "
+            f"{log_opt} {iolog_opt} {output_fmt}{output_file}"
+        ).strip()
 
         if kwargs.get("debug"):
-            command += " --debug=all"
+            fio_cmd += " --debug=all"
 
+        # run fio under tmux so HUP/SSH channel drops don't kill it
+        session = f"fio_{name}"
         if log_file:
-            command += f" > {log_file} 2>&1"
-        
-        # else:
-        #     command += " --debug=verify"
-        
-        # awk_ts = " | awk '{ print strftime(\"[%Y-%m-%d %H:%M:%S]\"), $0; fflush(); }' | "
-        # command += awk_ts
-        # command += f"tee {log_file}"
-
-        self.logger.info(f"Executing FIO command:\n{command}")
+            fio_cmd = f"{fio_cmd} > {log_file} 2>&1"
+
+        start_cmd = f"sudo tmux new-session -d -s {session} \"{fio_cmd}\" || sudo tmux kill-session -t {session} 2>/dev/null || true; sudo tmux new-session -d -s {session} \"{fio_cmd}\""
+        self.logger.info(f"Starting FIO on {node}: {name} in tmux session '{session}'")
+        self.exec_command(node=node, command=start_cmd, max_retries=2)
+
+        # Ensure process is up: check tmux & pgrep
+        for i in range(8):
+            out, _ = self.exec_command(node=node, command=f"pgrep -fa 'fio.*{name}' || true", max_retries=1, supress_logs=True)
+            tmux_ok, _ = self.exec_command(node=node, command=f"sudo tmux has-session -t {session} 2>/dev/null || echo MISSING", max_retries=1, supress_logs=True)
+            if out.strip() and "MISSING" not in tmux_ok:
+                self.logger.info(f"FIO is running for {name}: {out.strip().splitlines()[0]}")
+                return
+            if i >= 2:
+                self.logger.warning(f"FIO not detected yet for {name}; re-issuing start (try {i-1}/5)")
+                self.exec_command(node=node, command=start_cmd, max_retries=1, supress_logs=True)
+            time.sleep(2 + i)
 
-        start_time = time.time()
-        output, error = self.exec_command(node=node, command=command, timeout=runtime * 2)
-        end_time = time.time()
-
-        total_time = end_time - start_time
-        self.fio_runtime[name] = start_time
-        self.logger.info(f"Total time taken to run the command: {total_time:.2f} seconds")
-
-        # Return all generated iolog files (one per job)
-        iolog_files = [f"{iolog_base}.{i}" for i in range(numjobs)]
-        return {
-            "output": output,
-            "error": error,
-            "start_time": start_time,
-            "end_time": end_time,
-            "iolog_files": iolog_files,
-        }
+        raise RuntimeError(f"FIO failed to stay running for job {name} on {node}")
 
-    
+        
     def find_process_name(self, node, process_name, return_pid=False):
         if return_pid:
             command = "ps -ef | grep -i '%s' | awk '{print $2}'" % process_name
@@ -700,15 +1071,35 @@ def get_lvol_id(self, node, lvol_name):
         return output.strip().split()
     
     def get_snapshot_id(self, node, snapshot_name):
-        cmd = "%s snapshot list | grep -i '%s ' | awk '{print $2}'" % (self.base_cmd, snapshot_name)
-        output, error = self.exec_command(node=node, command=cmd)
+        start = time.time()
+        deadline = start + 600  # 10 minutes
+        wait_interval = 10       # seconds between checks
+        snapshot_id = ""
+
+        while time.time() < deadline:
+            cmd = "%s snapshot list | grep -i '%s ' | awk '{print $2}'" % (self.base_cmd, snapshot_name)
+            output, error = self.exec_command(node=node, command=cmd)
+            if output.strip():
+                if hasattr(self, "logger"):
+                    self.logger.info(f"Snapshot '{snapshot_name}' is visible with ID: {snapshot_id}")
+                break
+            time.sleep(wait_interval)
+
+        if not output.strip():
+            if hasattr(self, "logger"):
+                self.logger.error(f"Timed out waiting for snapshot '{snapshot_name}' to appear within 10 minutes.")
 
         return output.strip()
 
     def add_snapshot(self, node, lvol_id, snapshot_name):
         cmd = f"{self.base_cmd} -d snapshot add {lvol_id} {snapshot_name}"
         output, error = self.exec_command(node=node, command=cmd)
-        return output, error
+
+        snapshot_id = self.get_snapshot_id(node=node, snapshot_name=snapshot_name)
+
+        if not snapshot_id:
+            if hasattr(self, "logger"):
+                self.logger.error(f"Timed out waiting for snapshot '{snapshot_name}' to appear within 10 minutes.")
     
     def add_clone(self, node, snapshot_id, clone_name):
         cmd = f"{self.base_cmd} -d snapshot clone {snapshot_id} {clone_name}"
@@ -971,30 +1362,81 @@ def get_active_interfaces(self, node_ip):
             return []
         
 
-    def disconnect_all_active_interfaces(self, node_ip, interfaces, reconnect_time=300):
-        """
-        Disconnect all active network interfaces on a node in a single SSH call.
+    # def disconnect_all_active_interfaces(self, node_ip, interfaces, reconnect_time=300):
+    #     """
+    #     Disconnect all active network interfaces on a node in a single SSH call.
+
+    #     Args:
+    #         node_ip (str): IP of the target node.
+    #         interfaces (list): List of active network interfaces to disconnect.
+    #     """
+    #     if not interfaces:
+    #         self.logger.warning(f"No active interfaces to disconnect on node {node_ip}.")
+    #         return
+
+    #     # Combine disconnect commands for all interfaces
+    #     disconnect_cmds = " && ".join([f"sudo nmcli connection down {iface}" for iface in interfaces])
+    #     reconnect_cmds = " && ".join([f"sudo nmcli connection up {iface}" for iface in interfaces])
+
+    #     cmd = (
+    #         f'nohup sh -c "{disconnect_cmds} && sleep {reconnect_time} && {reconnect_cmds}" &'
+    #     )
+    #     self.logger.info(f"Executing combined disconnect command on node {node_ip}: {cmd}")
+    #     try:
+    #         self.exec_command(node_ip, cmd)
+    #     except Exception as e:
+    #         self.logger.error(f"Failed to execute combined disconnect command on {node_ip}: {e}")
+
+    def _ping_once(self, ip: str, count: int = 1, wait: int = 1) -> bool:
+        try:
+            # Use system ping; True means "ping success"
+            res = subprocess.run(["ping", "-c", str(count), "-W", str(wait), ip],
+                                 stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            return res.returncode == 0
+        except Exception:
+            return False
 
-        Args:
-            node_ip (str): IP of the target node.
-            interfaces (list): List of active network interfaces to disconnect.
+    def disconnect_all_active_interfaces(
+        self,
+        node_ip: str,
+        interfaces: list[str],
+        duration_secs: int = 300,
+        max_tries: int = 3,
+    ):
+        """
+        Bring all given interfaces DOWN, verify outage by ping, keep for duration, then bring them UP.
+        Fire-and-forget style; robust against brief SSH flaps.
         """
         if not interfaces:
-            self.logger.warning(f"No active interfaces to disconnect on node {node_ip}.")
+            self.logger.info(f"No active interfaces provided for {node_ip}; skipping NIC down.")
             return
 
-        # Combine disconnect commands for all interfaces
-        disconnect_cmds = " && ".join([f"sudo nmcli connection down {iface}" for iface in interfaces])
-        reconnect_cmds = " && ".join([f"sudo nmcli connection up {iface}" for iface in interfaces])
+        down_cmd = " && ".join([f"nmcli connection down {i}" for i in interfaces])
+        up_cmd   = " && ".join([f"nmcli connection up {i}" for i in interfaces])
+        cmd = f'nohup sh -c "{down_cmd} && sleep {duration_secs} && {up_cmd}" &'
 
-        cmd = (
-            f'nohup sh -c "{disconnect_cmds} && sleep {reconnect_time} && {reconnect_cmds}" &'
-        )
-        self.logger.info(f"Executing combined disconnect command on node {node_ip}: {cmd}")
         try:
-            self.exec_command(node_ip, cmd)
+            self.logger.info(f"Executing combined disconnect command on node {node_ip}: {cmd}")
+            out, err = self.exec_command(node=node_ip, command=cmd, max_retries=1, timeout=20)
+            if err:
+                raise Exception(err)
         except Exception as e:
-            self.logger.error(f"Failed to execute combined disconnect command on {node_ip}: {e}")
+            self.logger.info(f"Command: {cmd}, error: {e}! Checking pings!!")
+
+        # Verify outage begins (best-effort). If ping still works, attempt to issue 'down' again.
+        time.sleep(5)
+        tries = 0
+        attempts = 10
+        while self._ping_once(node_ip) and attempts > 0:
+            tries += 1
+            if tries >= max_tries:
+                self.logger.warning(f"Ping to {node_ip} still responding after NIC down attempts; continuing anyway.")
+                break
+            self.logger.info(f"Ping to {node_ip} still alive; retrying NIC down...")
+            # re-run only the DOWN part (don’t append sleep again to avoid stacking)
+            self.exec_command(node=node_ip, command=cmd, max_retries=2)
+            time.sleep(3)
+            attempts -= 1
 
     def check_tmux_installed(self, node_ip):
         """Check tmux installation
@@ -1420,132 +1862,263 @@ def dump_lvstore(self, node_ip, storage_node_id):
             self.logger.error(f"Failed to dump lvstore on {node_ip}: {e}")
             return None
         
-    def fetch_distrib_logs(self, storage_node_ip, storage_node_id, logs_path):
-        """
-        Fetch distrib names using bdev_get_bdevs RPC, generate and execute RPC JSON,
-        and copy logs from SPDK container.
+    # def fetch_distrib_logs(self, storage_node_ip, storage_node_id, logs_path):
+    #     """
+    #     Fetch distrib names using bdev_get_bdevs RPC, generate and execute RPC JSON,
+    #     and copy logs from SPDK container.
+
+    #     Args:
+    #         storage_node_ip (str): IP of the storage node
+    #         storage_node_id (str): ID of the storage node
+    #     """
+    #     self.logger.info(f"Fetching distrib logs for Storage Node ID: {storage_node_id} on {storage_node_ip}")
+
+    #     # Step 1: Find the SPDK container
+    #     find_container_cmd = "sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$'"
+    #     container_name_output, _ = self.exec_command(storage_node_ip, find_container_cmd)
+    #     container_name = container_name_output.strip()
+
+    #     if not container_name:
+    #         self.logger.warning(f"No SPDK container found on {storage_node_ip}")
+    #         return
+
+    #     # Step 2: Get bdev_get_bdevs output
+    #     # bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py bdev_get_bdevs'"
+    #     # bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd)
+
+    #     # if error:
+    #     #     self.logger.error(f"Error running bdev_get_bdevs: {error}")
+    #     #     return
+
+    #     # # Step 3: Save full output to local file
+    #     # timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S")
+    #     # raw_output_path = f"{Path.home()}/bdev_output_{storage_node_ip}_{timestamp}.json"
+    #     # with open(raw_output_path, "w") as f:
+    #     #     f.write(bdev_output)
+    #     # self.logger.info(f"Saved raw bdev_get_bdevs output to {raw_output_path}")
+
+    #     timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S")
+    #     base_path = f"{logs_path}/{storage_node_ip}/distrib_bdev_logs/"
+
+    #     cmd = f"sudo mkdir -p '{base_path}'"
+    #     self.exec_command(storage_node_ip, cmd)
+
+    #     remote_output_path = f"bdev_output_{storage_node_ip}_{timestamp}.json"
+
+    #     # 1. Run to capture output into a variable (for parsing)
+    #     bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py -s /mnt/ramdisk/{container_name}/spdk.sock bdev_get_bdevs'"
+    #     bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd)
+
+    #     if error:
+    #         self.logger.error(f"Error running bdev_get_bdevs: {error}")
+    #         return
+
+    #     # 2. Run again to save output on host machine (audit trail)
+    #     bdev_save_cmd = (
+    #         f"sudo bash -c \"docker exec {container_name} python spdk/scripts/rpc.py -s /mnt/ramdisk/{container_name}/spdk.sock bdev_get_bdevs > {remote_output_path}\"")
+
+    #     self.exec_command(storage_node_ip, bdev_save_cmd)
+    #     self.logger.info(f"Saved bdev_get_bdevs output to {remote_output_path} on {storage_node_ip}")
+
+
+    #     # Step 4: Extract unique distrib names
+    #     try:
+    #         bdevs = json.loads(bdev_output)
+    #         distribs = list({bdev['name'] for bdev in bdevs if bdev['name'].startswith('distrib_')})
+    #     except json.JSONDecodeError as e:
+    #         self.logger.error(f"JSON parsing failed: {e}")
+    #         return
+
+    #     if not distribs:
+    #         self.logger.warning("No distrib names found in bdev_get_bdevs output.")
+    #         return
+
+    #     self.logger.info(f"Distributions found: {distribs}")
+
+    #     # Step 5: Process each distrib
+    #     for distrib in distribs:
+    #         self.logger.info(f"Processing distrib: {distrib}")
+    #         rpc_json = {
+    #             "subsystems": [
+    #                 {
+    #                     "subsystem": "distr",
+    #                     "config": [
+    #                         {
+    #                             "method": "distr_debug_placement_map_dump",
+    #                             "params": {"name": distrib}
+    #                         }
+    #                     ]
+    #                 }
+    #             ]
+    #         }
+
+    #         rpc_json_str = json.dumps(rpc_json)
+    #         remote_json_path = "/tmp/stack.json"
+
+    #         # Save JSON file remotely
+    #         create_json_command = f"echo '{rpc_json_str}' | sudo tee {remote_json_path}"
+    #         self.exec_command(storage_node_ip, create_json_command)
+
+    #         # Copy into container
+    #         copy_json_command = f"sudo docker cp {remote_json_path} {container_name}:{remote_json_path}"
+    #         self.exec_command(storage_node_ip, copy_json_command)
+
+    #         # Run RPC inside container
+    #         rpc_command = f"sudo docker exec {container_name} bash -c 'python scripts/rpc_sock.py {remote_json_path} /mnt/ramdisk/{container_name}/spdk.sock'"
+    #         self.exec_command(storage_node_ip, rpc_command)
+
+    #         # Find and copy log
+    #         find_log_command = f"sudo docker exec {container_name} ls /tmp/ | grep {distrib}"
+    #         log_file_name, _ = self.exec_command(storage_node_ip, find_log_command)
+    #         log_file_name = log_file_name.strip().replace("\r", "").replace("\n", "")
+
+    #         if not log_file_name:
+    #             self.logger.error(f"No log file found for distrib {distrib}.")
+    #             continue
+
+    #         log_file_path = f"/tmp/{log_file_name}"
+    #         local_log_path = f"{base_path}/{log_file_name}_{storage_node_ip}_{timestamp}"
+    #         copy_log_cmd = f"sudo docker cp {container_name}:{log_file_path} {local_log_path}"
+    #         self.exec_command(storage_node_ip, copy_log_cmd)
+
+    #         self.logger.info(f"Fetched log for {distrib}: {local_log_path}")
+
+    #         # Clean up
+    #         delete_log_cmd = f"sudo docker exec {container_name} rm -f {log_file_path}"
+    #         self.exec_command(storage_node_ip, delete_log_cmd)
+
+    #     self.logger.info("All distrib logs retrieved successfully.")
 
-        Args:
-            storage_node_ip (str): IP of the storage node
-            storage_node_id (str): ID of the storage node
-        """
+    def fetch_distrib_logs(self, storage_node_ip, storage_node_id, logs_path):
         self.logger.info(f"Fetching distrib logs for Storage Node ID: {storage_node_id} on {storage_node_ip}")
 
-        # Step 1: Find the SPDK container
-        find_container_cmd = "sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$'"
-        container_name_output, _ = self.exec_command(storage_node_ip, find_container_cmd)
-        container_name = container_name_output.strip()
-
+        # 0) Find SPDK container name
+        find_container_cmd = "sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$' || true"
+        container_name_out, _ = self.exec_command(storage_node_ip, find_container_cmd)
+        container_name = (container_name_out or "").strip()
         if not container_name:
             self.logger.warning(f"No SPDK container found on {storage_node_ip}")
             return
 
-        # Step 2: Get bdev_get_bdevs output
-        # bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py bdev_get_bdevs'"
-        # bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd)
-
-        # if error:
-        #     self.logger.error(f"Error running bdev_get_bdevs: {error}")
-        #     return
-
-        # # Step 3: Save full output to local file
-        # timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S")
-        # raw_output_path = f"{Path.home()}/bdev_output_{storage_node_ip}_{timestamp}.json"
-        # with open(raw_output_path, "w") as f:
-        #     f.write(bdev_output)
-        # self.logger.info(f"Saved raw bdev_get_bdevs output to {raw_output_path}")
-
-        timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S")
-        base_path = f"{logs_path}/{storage_node_ip}/distrib_bdev_logs/"
-
-        cmd = f"sudo mkdir -p '{base_path}'"
-        self.exec_command(storage_node_ip, cmd)
-
-        remote_output_path = f"bdev_output_{storage_node_ip}_{timestamp}.json"
-
-        # 1. Run to capture output into a variable (for parsing)
-        bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py bdev_get_bdevs'"
-        bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd)
-
-        if error:
-            self.logger.error(f"Error running bdev_get_bdevs: {error}")
+        # 1) Get bdevs via correct sock
+        timestamp = datetime.now().strftime("%Y%m%d_%H-%M-%S")
+        base_path = f"{logs_path}/{storage_node_ip}/distrib_bdev_logs"
+        self.exec_command(storage_node_ip, f"sudo mkdir -p '{base_path}' && sudo chmod -R 777 '{base_path}'")
+        bdev_cmd = (
+            f"sudo docker exec {container_name} bash -lc "
+            f"\"python spdk/scripts/rpc.py -s /mnt/ramdisk/{container_name}/spdk.sock bdev_get_bdevs\""
+        )
+        bdev_output, bdev_err = self.exec_command(storage_node_ip, bdev_cmd)
+        if (bdev_err and bdev_err.strip()) and not bdev_output:
+            self.logger.error(f"bdev_get_bdevs error on {storage_node_ip}: {bdev_err.strip()}")
             return
 
-        # 2. Run again to save output on host machine (audit trail)
-        bdev_save_cmd = (
-            f"sudo bash -c \"docker exec {container_name} python spdk/scripts/rpc.py bdev_get_bdevs > {remote_output_path}\"")
-
-        self.exec_command(storage_node_ip, bdev_save_cmd)
-        self.logger.info(f"Saved bdev_get_bdevs output to {remote_output_path} on {storage_node_ip}")
-
-
-        # Step 4: Extract unique distrib names
+        # Parse distrib names
         try:
             bdevs = json.loads(bdev_output)
-            distribs = list({bdev['name'] for bdev in bdevs if bdev['name'].startswith('distrib_')})
+            distribs = sorted({
+                b.get("name", "")
+                for b in bdevs
+                if isinstance(b, dict) and str(b.get("name","")).startswith("distrib_")
+            })
         except json.JSONDecodeError as e:
-            self.logger.error(f"JSON parsing failed: {e}")
+            self.logger.error(f"JSON parsing failed on {storage_node_ip}: {e}")
             return
-
         if not distribs:
-            self.logger.warning("No distrib names found in bdev_get_bdevs output.")
+            self.logger.warning(f"No distrib_* bdevs found on {storage_node_ip}.")
+            return
+        self.logger.info(f"[{storage_node_ip}] Distributions: {distribs}")
+
+        # 2) Run multiple docker exec in parallel from ONE SSH exec
+        distrib_list_str = " ".join(shlex.quote(d) for d in distribs)
+        remote_tar = f"/tmp/distrib_logs_{timestamp}.tar.gz"
+
+        # IMPORTANT: This script runs on the HOST and spawns many `docker exec ... &` in parallel.
+        # It throttles with MAXJ, waits, then tars outputs from /tmp inside the container into one tarball on the host.
+        remote_script = f"""\
+set -euo pipefail
+CN={shlex.quote(container_name)}
+SOCK="/mnt/ramdisk/$CN/spdk.sock"
+TS="{timestamp}"
+MAXJ=8
+WORKDIR_HOST="{base_path}"
+mkdir -p "$WORKDIR_HOST"
+
+# Make a temporary host folder to collect per-distrib files copied out of the container
+HOST_STAGING="/tmp/distrib_host_collect_$TS"
+mkdir -p "$HOST_STAGING"
+
+pids=()
+
+for d in {distrib_list_str}; do
+  (
+    # Build JSON on host then copy into container (avoids many ssh execs)
+    JF="/tmp/stack_${{d}}.json"
+    cat > "$JF" <<'EOF_JSON'
+{{
+  "subsystems": [
+    {{
+      "subsystem": "distr",
+      "config": [
+        {{
+          "method": "distr_debug_placement_map_dump",
+          "params": {{"name": "__DIST__"}}
+        }}
+      ]
+    }}
+  ]
+}}
+EOF_JSON
+    # substitute distrib name
+    sed -i "s/__DIST__/$d/g" "$JF"
+
+    # Copy JSON into container
+    sudo docker cp "$JF" "$CN:/tmp/stack_${{d}}.json"
+
+    # Run rpc inside container (socket path respected)
+    sudo docker exec "$CN" bash -lc "python scripts/rpc_sock.py /tmp/stack_${{d}}.json {shlex.quote('/mnt/ramdisk/'+container_name+'/spdk.sock')} > /tmp/rpc_${{d}}.log 2>&1 || true"
+
+    # Copy any files for this distrib out to host staging (rpc log + any matching /tmp/*d*)
+    sudo docker cp "$CN:/tmp/rpc_${{d}}.log" "$HOST_STAGING/rpc_${{d}}.log" 2>/dev/null || true
+    # try to pull any distrib-related artifacts
+    for f in $(sudo docker exec "$CN" bash -lc "ls /tmp/ 2>/dev/null | grep -F \"$d\" || true"); do
+      sudo docker cp "$CN:/tmp/$f" "$HOST_STAGING/$f" 2>/dev/null || true
+    done
+
+    # cleanup container temp for this distrib
+    sudo docker exec "$CN" bash -lc "rm -f /tmp/stack_${{d}}.json /tmp/rpc_${{d}}.log" || true
+    rm -f "$JF" || true
+  ) &
+
+  # throttle parallel jobs
+  while [ "$(jobs -rp | wc -l)" -ge "$MAXJ" ]; do sleep 0.2; done
+done
+
+# Wait for all background jobs
+wait
+
+# Tar once on host
+tar -C "$HOST_STAGING" -czf {shlex.quote(remote_tar)} . 2>/dev/null || true
+
+# Move artifacts to final location
+mv -f {shlex.quote(remote_tar)} "$WORKDIR_HOST/" || true
+
+# Also copy loose files (for convenience) then clean staging
+cp -rf "$HOST_STAGING"/. "$WORKDIR_HOST"/ 2>/dev/null || true
+rm -rf "$HOST_STAGING" || true
+
+echo "$WORKDIR_HOST/{os.path.basename(remote_tar)}"
+"""
+
+        run_many_cmd = "bash -lc " + shlex.quote(remote_script)
+        tar_out, tar_err = self.exec_command(storage_node_ip, run_many_cmd)
+        if (tar_err and tar_err.strip()) and not tar_out:
+            self.logger.error(f"[{storage_node_ip}] Parallel docker-exec script error: {tar_err.strip()}")
             return
 
-        self.logger.info(f"Distributions found: {distribs}")
-
-        # Step 5: Process each distrib
-        for distrib in distribs:
-            self.logger.info(f"Processing distrib: {distrib}")
-            rpc_json = {
-                "subsystems": [
-                    {
-                        "subsystem": "distr",
-                        "config": [
-                            {
-                                "method": "distr_debug_placement_map_dump",
-                                "params": {"name": distrib}
-                            }
-                        ]
-                    }
-                ]
-            }
-
-            rpc_json_str = json.dumps(rpc_json)
-            remote_json_path = "/tmp/stack.json"
-
-            # Save JSON file remotely
-            create_json_command = f"echo '{rpc_json_str}' | sudo tee {remote_json_path}"
-            self.exec_command(storage_node_ip, create_json_command)
-
-            # Copy into container
-            copy_json_command = f"sudo docker cp {remote_json_path} {container_name}:{remote_json_path}"
-            self.exec_command(storage_node_ip, copy_json_command)
-
-            # Run RPC inside container
-            rpc_command = f"sudo docker exec {container_name} bash -c 'python scripts/rpc_sock.py {remote_json_path}'"
-            self.exec_command(storage_node_ip, rpc_command)
-
-            # Find and copy log
-            find_log_command = f"sudo docker exec {container_name} ls /tmp/ | grep {distrib}"
-            log_file_name, _ = self.exec_command(storage_node_ip, find_log_command)
-            log_file_name = log_file_name.strip().replace("\r", "").replace("\n", "")
-
-            if not log_file_name:
-                self.logger.error(f"No log file found for distrib {distrib}.")
-                continue
-
-            log_file_path = f"/tmp/{log_file_name}"
-            local_log_path = f"{base_path}/{log_file_name}_{storage_node_ip}_{timestamp}"
-            copy_log_cmd = f"sudo docker cp {container_name}:{log_file_path} {local_log_path}"
-            self.exec_command(storage_node_ip, copy_log_cmd)
-
-            self.logger.info(f"Fetched log for {distrib}: {local_log_path}")
-
-            # Clean up
-            delete_log_cmd = f"sudo docker exec {container_name} rm -f {log_file_path}"
-            self.exec_command(storage_node_ip, delete_log_cmd)
+        final_tar = (tar_out or "").strip().splitlines()[-1] if tar_out else f"{base_path}/{os.path.basename(remote_tar)}"
+        self.logger.info(f"[{storage_node_ip}] Distrib logs saved: {base_path} (tar: {final_tar})")
 
-        self.logger.info("All distrib logs retrieved successfully.")
 
     def clone_mount_gen_uuid(self, node, device):
         """Repair the XFS filesystem and generate a new UUID.
@@ -1722,8 +2295,8 @@ def start_netstat_dmesg_logging(self, node_ip, log_dir):
 
         self.exec_command(node_ip, f"sudo tmux new-session -d -s netstat_log 'bash -c \"while true; do netstat -s | grep \\\"segments dropped\\\" >> {netstat_log}; sleep 5; done\"'")
         self.exec_command(node_ip, f"sudo tmux new-session -d -s dmesg_log 'bash -c \"while true; do sudo dmesg | grep -i \\\"tcp\\\" >> {dmesg_log}; sleep 5; done\"'")
-        self.exec_command(node_ip, f"sudo tmux new-session -d -s journalctl_log 'bash -c \"while true; do sudo journalctl -k | grep -i \\\"tcp\\\" >> {journalctl_log}; sleep 5; done\"'")
-
+        self.exec_command(node_ip, f"sudo tmux new-session -d -s journalctl_log 'bash -c \"while true; do sudo journalctl -k --no-tail | grep -i \\\"tcp\\\" >> {journalctl_log}; sleep 5; done\"'")
+                
     def reset_iptables_in_spdk(self, node_ip):
         """
         Resets iptables rules inside the SPDK container on a given node.
@@ -1915,6 +2488,7 @@ def start_resource_monitors(self, node_ip, log_dir):
         root_log = f"{log_dir}/root_partition_usage_{node_ip}_{timestamp}.txt"
         docker_mem_log = f"{log_dir}/docker_mem_usage_{node_ip}_{timestamp}.txt"
         system_mem_log = f"{log_dir}/system_memory_usage_{node_ip}_{timestamp}.txt"
+        docker_stats_logs = f"{log_dir}/docker_stats_usage_{node_ip}_{timestamp}.txt"
 
         # Ensure log directory exists and is writable
         self.exec_command(node_ip, f"sudo mkdir -p {log_dir} && sudo chmod 777 {log_dir}")
@@ -1939,14 +2513,29 @@ def start_resource_monitors(self, node_ip, log_dir):
         'bash -c "while true; do date >> {system_mem_log}; free -h >> {system_mem_log}; echo >> {system_mem_log}; sleep 10; done"'
         """
 
+        docker_stats_cmd = f"""
+        sudo tmux new-session -d -s docker_stats_all \
+        'bash -c "while true; do date >> {docker_stats_logs}; docker stats --no-stream >> {docker_stats_logs}; echo >> {docker_stats_logs}; sleep 10; done"'
+        """
+
         self.exec_command(node_ip, df_cmd)
         self.exec_command(node_ip, docker_cmd)
         self.exec_command(node_ip, system_cmd)
+        self.exec_command(node_ip, docker_stats_cmd)
 
-        self.logger.info(f"Started root partition, container memory, and system memory logging on {node_ip}")
+        self.logger.info(f"Started root partition, container memory, docker stats and system memory logging on {node_ip}")
+    
+    def cluster_list(self, node_ip, cluster_id):
+        """Sets cluster in suspended state
 
+        Args:
+            node_ip (str): Mgmt Node IP to run command on
+            cluster_id (str): Cluster id to put in suspended state
+        """
+        cmd = f"{self.base_cmd} cluster list"
+        output, _ = self.exec_command(node_ip, cmd)
+        return output.strip()
 
-    
     def suspend_cluster(self, node_ip, cluster_id):
         """Sets cluster in suspended state
 
@@ -1995,7 +2584,7 @@ def ensure_nfs_mounted(self, node, nfs_server, nfs_path, mount_point, is_local =
         """
         check_cmd = f"mount | grep -w '{mount_point}'"
         mount_cmd = f"sudo mkdir -p {mount_point} && sudo mount -t nfs {nfs_server}:{nfs_path} {mount_point}"
-        install_check_cmd = "dnf list installed nfs-util"
+        install_check_cmd = "dnf list installed nfs-utils"
         install_cmd = "sudo dnf install -y nfs-utils"
 
         try:
@@ -2300,3 +2889,9 @@ def stop_log_monitor(self):
             self._monitor_stop_flag.set()
             self._monitor_thread.join(timeout=10)
             print("K8s log monitor thread stopped.")
+
+def _rid(n=6):
+    import string, random
+    letters = string.ascii_uppercase
+    digits = string.digits
+    return random.choice(letters) + ''.join(random.choices(letters + digits, k=n-1))

From 24d6ced0c56a8b06b9eb6b60deae90af843cf431 Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Fri, 5 Dec 2025 14:44:49 +0300
Subject: [PATCH 50/68] Fix sn list apiv2 response _2 (#807)

---
 simplyblock_web/api/v2/dtos.py         | 10 ++++++----
 simplyblock_web/api/v2/storage_node.py |  4 ++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py
index 54c1b5b01..f8ba77ae7 100644
--- a/simplyblock_web/api/v2/dtos.py
+++ b/simplyblock_web/api/v2/dtos.py
@@ -151,16 +151,18 @@ def from_model(model: SnapShot, request: Request, cluster_id, pool_id, volume_id
 
 
 class StorageNodeDTO(BaseModel):
-    id: UUID
+    uuid: UUID
     status: str
-    ip: IPv4Address
+    mgmt_ip: IPv4Address
+    health_check: bool
 
     @staticmethod
     def from_model(model: StorageNode):
         return StorageNodeDTO(
-            id=UUID(model.get_id()),
+            uuid=UUID(model.get_id()),
             status=model.status,
-            ip=IPv4Address(model.mgmt_ip),
+            mgmt_ip=IPv4Address(model.mgmt_ip),
+            health_check=model.health_check,
         )
 
 
diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py
index d1aec59be..e612d7177 100644
--- a/simplyblock_web/api/v2/storage_node.py
+++ b/simplyblock_web/api/v2/storage_node.py
@@ -21,9 +21,9 @@
 
 
 @api.get('/', name='clusters:storage-nodes:list')
-def list(cluster: Cluster) -> List[dict]:
+def list(cluster: Cluster) -> List[StorageNodeDTO]:
     return [
-        storage_node.to_dict()
+        StorageNodeDTO.from_model(storage_node)
         for storage_node
         in db.get_storage_nodes_by_cluster_id(cluster.get_id())
     ]

From d1163e32657f800765fc08c4e09551b1da4daa65 Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Mon, 8 Dec 2025 10:20:02 +0100
Subject: [PATCH 51/68] Update cluster.py (#808)

---
 simplyblock_web/api/v1/cluster.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simplyblock_web/api/v1/cluster.py b/simplyblock_web/api/v1/cluster.py
index f4eb2e690..2447cf958 100644
--- a/simplyblock_web/api/v1/cluster.py
+++ b/simplyblock_web/api/v1/cluster.py
@@ -73,7 +73,7 @@ def create_first_cluster():
             return utils.get_response_error("blk_size can be 512 or 4096", 400)
         else:
             blk_size = cl_data['blk_size']
-    page_size_in_blocks = cl_data.get('distr_ndcs', 2097152)
+    page_size_in_blocks = cl_data.get('page_size_in_blocks', 2097152)
     distr_ndcs = cl_data.get('distr_ndcs', 1)
     distr_npcs = cl_data.get('distr_npcs', 1)
     distr_bs = cl_data.get('distr_bs', 4096)

From d1636621037e385a11af6a2d6997d188c80f73ae Mon Sep 17 00:00:00 2001
From: Geoffrey Israel <israelgeoffrey13@gmail.com>
Date: Mon, 8 Dec 2025 13:50:22 +0100
Subject: [PATCH 52/68] Fdb health check (#809)

* check fdb connection string

* check fdb connection string

* remove fdb cluster uuid

* removed simplyblock crd
---
 simplyblock_web/api/v1/__init__.py | 31 ++++++++++++++++++++++++++++++
 simplyblock_web/auth_middleware.py |  2 ++
 2 files changed, 33 insertions(+)

diff --git a/simplyblock_web/api/v1/__init__.py b/simplyblock_web/api/v1/__init__.py
index 4bcc5ba41..084a737cc 100644
--- a/simplyblock_web/api/v1/__init__.py
+++ b/simplyblock_web/api/v1/__init__.py
@@ -1,9 +1,12 @@
 import logging
+import os
 
+from flask import jsonify
 from flask import Flask
 
 from simplyblock_web.auth_middleware import token_required
 from simplyblock_web import utils
+from simplyblock_core import constants
 
 from . import cluster
 from . import mgmt_node
@@ -39,3 +42,31 @@ def before_request():
 @api.route('/', methods=['GET'])
 def status():
     return utils.get_response("Live")
+
+@api.route('/health/fdb', methods=['GET'])
+def health_fdb():
+    fdb_cluster_file = constants.KVD_DB_FILE_PATH
+
+    if not os.path.exists(fdb_cluster_file):
+        return jsonify({
+            "fdb_connected": False,
+            "message": "FDB cluster file not found"
+        }), 503
+
+    try:
+        with open(fdb_cluster_file, 'r') as f:
+            cluster_data = f.read().strip()
+            if not cluster_data:
+                return jsonify({
+                    "fdb_connected": False,
+                    "message": "FDB cluster file is empty"
+                }), 503
+    except Exception as e:
+        return jsonify({
+            "fdb_connected": False,
+            "message": f"Failed to read FDB cluster file: {str(e)}"
+        }), 503
+
+    return jsonify({
+        "fdb_connected": True,
+    }), 200
diff --git a/simplyblock_web/auth_middleware.py b/simplyblock_web/auth_middleware.py
index 70755b46a..87449cb64 100644
--- a/simplyblock_web/auth_middleware.py
+++ b/simplyblock_web/auth_middleware.py
@@ -36,6 +36,8 @@ def decorated(*args: Any, **kwargs: Any) -> ResponseType:
             return cast(ResponseType, f(*args, **kwargs))
         if request.method == "POST" and request.path.startswith("/cluster/create_first"):
             return cast(ResponseType, f(*args, **kwargs))
+        if request.method == "GET" and request.path.startswith("/health/fdb"):
+            return cast(ResponseType, f(*args, **kwargs))            
 
         cluster_id: str = ""
         cluster_secret: str = ""

From e344c9336d6c7f0ae233d72cb63868c2a2eded66 Mon Sep 17 00:00:00 2001
From: hamdykhader <hamdy.khader@gmail.com>
Date: Mon, 8 Dec 2025 16:02:58 +0300
Subject: [PATCH 53/68] Fix sfam-2515

check JM replication status on sec before dropping leadership during node restart and node down>online status change
---
 .../controllers/storage_events.py             | 12 ++++++++++
 simplyblock_core/models/storage_node.py       | 22 ++++++++++++++++++-
 .../services/tasks_runner_port_allow.py       |  8 +++++++
 simplyblock_core/storage_node_ops.py          |  6 +++++
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/simplyblock_core/controllers/storage_events.py b/simplyblock_core/controllers/storage_events.py
index b73890cd8..bd5a9eb8d 100644
--- a/simplyblock_core/controllers/storage_events.py
+++ b/simplyblock_core/controllers/storage_events.py
@@ -72,3 +72,15 @@ def snode_rpc_timeout(node, timeout_seconds, caused_by=ec.CAUSED_BY_MONITOR):
         event_level=EventObj.LEVEL_WARN,
         message=f"Storage node RPC timeout detected after {timeout_seconds} seconds",
         node_id=node.get_id())
+
+
+def jm_repl_tasks_found(node, jm_vuid, caused_by=ec.CAUSED_BY_MONITOR):
+    ec.log_event_cluster(
+        cluster_id=node.cluster_id,
+        domain=ec.DOMAIN_CLUSTER,
+        event=ec.EVENT_STATUS_CHANGE,
+        db_object=node,
+        caused_by=caused_by,
+        event_level=EventObj.LEVEL_WARN,
+        message=f"JM replication task found for jm {jm_vuid}",
+        node_id=node.get_id())
diff --git a/simplyblock_core/models/storage_node.py b/simplyblock_core/models/storage_node.py
index 81639c556..45abceec9 100644
--- a/simplyblock_core/models/storage_node.py
+++ b/simplyblock_core/models/storage_node.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-
+import time
 from typing import List
 from uuid import uuid4
 
@@ -302,3 +302,23 @@ def create_alceml(self, name, nvme_bdev, uuid, **kwargs):
             alceml_worker_cpu_mask=alceml_worker_cpu_mask,
             **kwargs,
         )
+
+    def wait_for_jm_rep_tasks_to_finish(self, jm_vuid):
+        retry = 10
+        while retry > 0:
+            try:
+                jm_replication_tasks = False
+                ret = self.rpc_client().jc_get_jm_status(jm_vuid)
+                for jm in ret:
+                    if ret[jm] is False:  # jm is not ready (has active replication task)
+                        jm_replication_tasks = True
+                        break
+                if jm_replication_tasks:
+                    logger.warning(f"Replication task found on node: {self.get_id()}, jm_vuid: {jm_vuid}, retry...")
+                    retry -= 1
+                    time.sleep(20)
+                else:
+                    return True
+            except Exception:
+                logger.warning("Failed to get replication task!")
+        return False
diff --git a/simplyblock_core/services/tasks_runner_port_allow.py b/simplyblock_core/services/tasks_runner_port_allow.py
index e95dbdf94..96ffc4664 100644
--- a/simplyblock_core/services/tasks_runner_port_allow.py
+++ b/simplyblock_core/services/tasks_runner_port_allow.py
@@ -206,6 +206,14 @@
 
                         if sec_node and sec_node.status == StorageNode.STATUS_ONLINE:
                             sec_rpc_client = sec_node.rpc_client()
+                            ret = sec_node.wait_for_jm_rep_tasks_to_finish(node.jm_vuid)
+                            if not ret:
+                                msg = "JM replication task found on secondary"
+                                logger.warning(msg)
+                                task.function_result = msg
+                                task.status = JobSchedule.STATUS_SUSPENDED
+                                task.write_to_db(db.kv_store)
+                                continue
                             sec_rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False, bs_nonleadership=True)
 
                         port_number = task.function_params["port_number"]
diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py
index 9b6630680..ec4f8e514 100644
--- a/simplyblock_core/storage_node_ops.py
+++ b/simplyblock_core/storage_node_ops.py
@@ -3177,6 +3177,12 @@ def recreate_lvstore(snode, force=False):
 
             time.sleep(0.5)
             ### 4- set leadership to false
+            ret = sec_node.wait_for_jm_rep_tasks_to_finish(snode.jm_vuid)
+            if not ret:
+                msg = f"JM replication task found for jm {snode.jm_vuid}"
+                logger.error(msg)
+                storage_events.jm_repl_tasks_found(sec_node, snode.jm_vuid)
+
             sec_rpc_client.bdev_lvol_set_leader(snode.lvstore, leader=False, bs_nonleadership=True)
             sec_rpc_client.bdev_distrib_force_to_non_leader(snode.jm_vuid)
             ### 4-1 check for inflight IO. retry every 100ms up to 10 seconds

From 6a9357ca7bda0009563e5a823fa29942d501371b Mon Sep 17 00:00:00 2001
From: hamdykhader <hamdy.khader@gmail.com>
Date: Wed, 10 Dec 2025 17:58:41 +0300
Subject: [PATCH 54/68] Fix sfam-2502

Fix the check for JM repl tasks to be before blocking the port
---
 simplyblock_core/storage_node_ops.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py
index ec4f8e514..ccfdbebe0 100644
--- a/simplyblock_core/storage_node_ops.py
+++ b/simplyblock_core/storage_node_ops.py
@@ -3172,17 +3172,18 @@ def recreate_lvstore(snode, force=False):
             port_type = "tcp"
             if sec_node.active_rdma:
                 port_type = "udp"
-            fw_api.firewall_set_port(snode.lvol_subsys_port, port_type, "block", sec_node.rpc_port)
-            tcp_ports_events.port_deny(sec_node, snode.lvol_subsys_port)
 
-            time.sleep(0.5)
-            ### 4- set leadership to false
             ret = sec_node.wait_for_jm_rep_tasks_to_finish(snode.jm_vuid)
             if not ret:
                 msg = f"JM replication task found for jm {snode.jm_vuid}"
                 logger.error(msg)
                 storage_events.jm_repl_tasks_found(sec_node, snode.jm_vuid)
 
+            fw_api.firewall_set_port(snode.lvol_subsys_port, port_type, "block", sec_node.rpc_port)
+            tcp_ports_events.port_deny(sec_node, snode.lvol_subsys_port)
+
+            time.sleep(0.5)
+            ### 4- set leadership to false
             sec_rpc_client.bdev_lvol_set_leader(snode.lvstore, leader=False, bs_nonleadership=True)
             sec_rpc_client.bdev_distrib_force_to_non_leader(snode.jm_vuid)
             ### 4-1 check for inflight IO. retry every 100ms up to 10 seconds

From bc6470475c8db8bfce58c571957615d0271f168f Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Fri, 12 Dec 2025 14:51:24 +0300
Subject: [PATCH 55/68] Add task API to v2 (#818)

* Add task API to v2

* Add task API to v2 _2

* Add task API to v2 _3

* Add task API to v2 _4
---
 simplyblock_web/api/v2/__init__.py | 3 ++-
 simplyblock_web/api/v2/task.py     | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/simplyblock_web/api/v2/__init__.py b/simplyblock_web/api/v2/__init__.py
index ff8511e1c..c3723cce6 100644
--- a/simplyblock_web/api/v2/__init__.py
+++ b/simplyblock_web/api/v2/__init__.py
@@ -10,6 +10,7 @@
 from . import pool
 from . import snapshot
 from . import storage_node
+from . import task
 
 from simplyblock_core.db_controller import DBController
 
@@ -37,7 +38,7 @@ def _verify_api_token(
 storage_node.api.include_router(storage_node.instance_api)
 
 cluster.instance_api.include_router(storage_node.api)
-
+cluster.instance_api.include_router(task.api)
 
 volume.api.include_router(volume.instance_api)
 pool.instance_api.include_router(volume.api)
diff --git a/simplyblock_web/api/v2/task.py b/simplyblock_web/api/v2/task.py
index c17bec3b7..83890640f 100644
--- a/simplyblock_web/api/v2/task.py
+++ b/simplyblock_web/api/v2/task.py
@@ -40,3 +40,5 @@ def _lookup_task(task_id: UUID) -> JobSchedule:
 @instance_api.get('/', name='clusters:tasks:detail')
 def get(cluster: Cluster, task: Task) -> TaskDTO:
     return TaskDTO.from_model(task)
+
+api.include_router(instance_api)

From b1599511d93d21f76b006d784e51b6a6c9c71a48 Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Mon, 15 Dec 2025 13:59:44 +0300
Subject: [PATCH 56/68] Show number of devices on storage node response (#819)

show used number of cpus beaed on the spdk_mask
---
 simplyblock_core/storage_node_ops.py | 3 +--
 simplyblock_web/api/v2/dtos.py       | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py
index ccfdbebe0..582fe918d 100644
--- a/simplyblock_core/storage_node_ops.py
+++ b/simplyblock_core/storage_node_ops.py
@@ -1108,8 +1108,6 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list,
         snode.active_tcp=active_tcp
         snode.active_rdma=active_rdma
 
-        if 'cpu_count' in node_info:
-            snode.cpu = node_info['cpu_count']
         if 'cpu_hz' in node_info:
             snode.cpu_hz = node_info['cpu_hz']
         if 'memory' in node_info:
@@ -1117,6 +1115,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list,
         if 'hugepages' in node_info:
             snode.hugepages = node_info['hugepages']
 
+        snode.cpu = len(utils.hexa_to_cpu_list(spdk_cpu_mask))
         snode.l_cores = l_cores or ""
         snode.spdk_cpu_mask = spdk_cpu_mask or ""
         snode.spdk_mem = minimum_hp_memory
diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py
index f8ba77ae7..c44333d29 100644
--- a/simplyblock_web/api/v2/dtos.py
+++ b/simplyblock_web/api/v2/dtos.py
@@ -155,6 +155,7 @@ class StorageNodeDTO(BaseModel):
     status: str
     mgmt_ip: IPv4Address
     health_check: bool
+    online_devices: str
 
     @staticmethod
     def from_model(model: StorageNode):
@@ -163,6 +164,7 @@ def from_model(model: StorageNode):
             status=model.status,
             mgmt_ip=IPv4Address(model.mgmt_ip),
             health_check=model.health_check,
+            online_devices=f"{len(model.nvme_devices)}/{len([d for d in model.nvme_devices if d.status=='online'])}",
         )
 
 

From 8e10e29237b33cc5ef19354b2bf7d11026cbb494 Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Mon, 15 Dec 2025 22:53:35 +0300
Subject: [PATCH 57/68] Return capacity usage per object in API V2 (cluster,
 node, device, pool, lvol) (#820)

* Return capacity usage per node object in API V2

* Fix type checker

* Adds capacity re reponse API v2 to:
cluster, node, device, pool, lvol
---
 simplyblock_web/api/v2/cluster.py      | 20 +++++++-----
 simplyblock_web/api/v2/device.py       | 18 ++++++++---
 simplyblock_web/api/v2/dtos.py         | 42 +++++++++++++++++++++++---
 simplyblock_web/api/v2/pool.py         | 21 ++++++++-----
 simplyblock_web/api/v2/storage_node.py | 19 ++++++++----
 simplyblock_web/api/v2/volume.py       | 19 ++++++++----
 6 files changed, 103 insertions(+), 36 deletions(-)

diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py
index 7834e3f06..19e9dbbf4 100644
--- a/simplyblock_web/api/v2/cluster.py
+++ b/simplyblock_web/api/v2/cluster.py
@@ -48,12 +48,14 @@ class ClusterParams(BaseModel):
 
 @api.get('/', name='clusters:list')
 def list() -> List[ClusterDTO]:
-    return [
-        ClusterDTO.from_model(cluster)
-        for cluster
-        in db.get_clusters()
-    ]
-
+    data = []
+    for cluster in db.get_clusters():
+        stat_obj = None
+        ret = db.get_cluster_capacity(cluster, 1)
+        if ret:
+            stat_obj = ret[0]
+        data.append(ClusterDTO.from_model(cluster, stat_obj))
+    return data
 
 @api.post('/', name='clusters:create', status_code=201, responses={201: {"content": None}})
 def add(request: Request, parameters: ClusterParams):
@@ -80,7 +82,11 @@ def _lookup_cluster(cluster_id: UUID):
 
 @instance_api.get('/', name='clusters:detail')
 def get(cluster: Cluster) -> ClusterDTO:
-    return ClusterDTO.from_model(cluster)
+    stat_obj = None
+    ret = db.get_cluster_capacity(cluster, 1)
+    if ret:
+        stat_obj = ret[0]
+    return ClusterDTO.from_model(cluster, stat_obj)
 
 
 class UpdatableClusterParameters(BaseModel):
diff --git a/simplyblock_web/api/v2/device.py b/simplyblock_web/api/v2/device.py
index 1c7b40d7e..4fa0949fb 100644
--- a/simplyblock_web/api/v2/device.py
+++ b/simplyblock_web/api/v2/device.py
@@ -18,10 +18,14 @@
 
 @api.get('/', name='clusters:storage_nodes:devices:list')
 def list(cluster: Cluster, storage_node: StorageNode) -> List[DeviceDTO]:
-    return [
-        DeviceDTO.from_model(device)
-        for device in storage_node.nvme_devices
-    ]
+    data = []
+    for device in storage_node.nvme_devices:
+        stat_obj = None
+        ret = db.get_device_stats(device, 1)
+        if ret:
+            stat_obj = ret[0]
+        data.append(DeviceDTO.from_model(device, stat_obj))
+    return data
 
 instance_api = APIRouter(prefix='/{device_id}')
 
@@ -38,7 +42,11 @@ def _lookup_device(storage_node: StorageNode, device_id: UUID) -> NVMeDevice:
 
 @instance_api.get('/', name='clusters:storage_nodes:devices:detail')
 def get(cluster: Cluster, storage_node: StorageNode, device: Device) -> DeviceDTO:
-    return DeviceDTO.from_model(device)
+    stat_obj = None
+    ret = db.get_device_stats(device, 1)
+    if ret:
+        stat_obj = ret[0]
+    return DeviceDTO.from_model(device, stat_obj)
 
 
 @instance_api.delete('/', name='clusters:storage_nodes:devices:delete', status_code=204, responses={204: {"content": None}})
diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py
index c44333d29..62f1a94e1 100644
--- a/simplyblock_web/api/v2/dtos.py
+++ b/simplyblock_web/api/v2/dtos.py
@@ -12,11 +12,33 @@
 from simplyblock_core.models.nvme_device import NVMeDevice
 from simplyblock_core.models.pool import Pool
 from simplyblock_core.models.snapshot import SnapShot
+from simplyblock_core.models.stats import StatsObject
 from simplyblock_core.models.storage_node import StorageNode
 
 from . import util
 
 
+class CapacityStatDTO(BaseModel):
+    date: int
+    size_total: int
+    size_prov: int
+    size_used: int
+    size_free: int
+    size_util: int
+
+    @staticmethod
+    def from_model(model: StatsObject):
+        return CapacityStatDTO(
+            date=model.date,
+            size_total=model.size_total,
+            size_prov=model.size_prov,
+            size_used=model.size_used,
+            size_free=model.size_free,
+            size_util=model.size_util,
+        )
+
+
+
 class ClusterDTO(BaseModel):
     id: UUID
     name: Optional[str]
@@ -33,9 +55,10 @@ class ClusterDTO(BaseModel):
     node_affinity: bool
     anti_affinity: bool
     secret: str
+    capacity: CapacityStatDTO
 
     @staticmethod
-    def from_model(model: Cluster):
+    def from_model(model: Cluster, stat_obj: Optional[StatsObject]=None):
         return ClusterDTO(
             id=UUID(model.get_id()),
             name=model.cluster_name,
@@ -52,6 +75,7 @@ def from_model(model: Cluster):
             node_affinity=model.enable_node_affinity,
             anti_affinity=model.strict_node_anti_affinity,
             secret=model.secret,
+            capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()),
         )
 
 
@@ -65,9 +89,10 @@ class DeviceDTO(BaseModel):
     nvmf_ips: List[IPv4Address]
     nvmf_nqn: str = ""
     nvmf_port: int = 0
+    capacity: CapacityStatDTO
 
     @staticmethod
-    def from_model(model: NVMeDevice):
+    def from_model(model: NVMeDevice, stat_obj: Optional[StatsObject]=None):
         return DeviceDTO(
             id=UUID(model.get_id()),
             status=model.status,
@@ -78,6 +103,7 @@ def from_model(model: NVMeDevice):
             nvmf_ips=[IPv4Address(ip) for ip in model.nvmf_ip.split(',')],
             nvmf_nqn=model.nvmf_nqn,
             nvmf_port=model.nvmf_port,
+            capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()),
         )
 
 
@@ -107,9 +133,10 @@ class StoragePoolDTO(BaseModel):
     max_rw_mbytes: util.Unsigned
     max_r_mbytes: util.Unsigned
     max_w_mbytes: util.Unsigned
+    capacity: CapacityStatDTO
 
     @staticmethod
-    def from_model(model: Pool):
+    def from_model(model: Pool, stat_obj: Optional[StatsObject]=None):
         return StoragePoolDTO(
             id=UUID(model.get_id()),
             name=model.pool_name,
@@ -120,6 +147,7 @@ def from_model(model: Pool):
             max_rw_mbytes=model.max_rw_mbytes_per_sec,
             max_r_mbytes=model.max_r_mbytes_per_sec,
             max_w_mbytes=model.max_w_mbytes_per_sec,
+            capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()),
         )
 
 
@@ -156,15 +184,17 @@ class StorageNodeDTO(BaseModel):
     mgmt_ip: IPv4Address
     health_check: bool
     online_devices: str
+    capacity: CapacityStatDTO
 
     @staticmethod
-    def from_model(model: StorageNode):
+    def from_model(model: StorageNode, stat_obj: Optional[StatsObject]=None):
         return StorageNodeDTO(
             uuid=UUID(model.get_id()),
             status=model.status,
             mgmt_ip=IPv4Address(model.mgmt_ip),
             health_check=model.health_check,
             online_devices=f"{len(model.nvme_devices)}/{len([d for d in model.nvme_devices if d.status=='online'])}",
+            capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()),
         )
 
 
@@ -208,9 +238,10 @@ class VolumeDTO(BaseModel):
     max_rw_mbytes: util.Unsigned
     max_r_mbytes: util.Unsigned
     max_w_mbytes: util.Unsigned
+    capacity: CapacityStatDTO
 
     @staticmethod
-    def from_model(model: LVol, request: Request, cluster_id: str):
+    def from_model(model: LVol, request: Request, cluster_id: str, stat_obj: Optional[StatsObject]=None):
         return VolumeDTO(
             id=UUID(model.get_id()),
             name=model.lvol_name,
@@ -243,4 +274,5 @@ def from_model(model: LVol, request: Request, cluster_id: str):
             max_rw_mbytes=model.rw_mbytes_per_sec,
             max_r_mbytes=model.r_mbytes_per_sec,
             max_w_mbytes=model.w_mbytes_per_sec,
+            capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()),
         )
diff --git a/simplyblock_web/api/v2/pool.py b/simplyblock_web/api/v2/pool.py
index 4ef2c897b..4ccae01ab 100644
--- a/simplyblock_web/api/v2/pool.py
+++ b/simplyblock_web/api/v2/pool.py
@@ -20,12 +20,15 @@
 
 @api.get('/', name='clusters:storage-pools:list')
 def list(cluster: Cluster) -> List[StoragePoolDTO]:
-    return [
-        StoragePoolDTO.from_model(pool)
-        for pool
-        in db.get_pools()
-        if pool.cluster_id == cluster.get_id()
-    ]
+    data = []
+    for pool in db.get_pools():
+        if pool.cluster_id == cluster.get_id():
+            stat_obj = None
+            ret = db.get_pool_stats(pool, 1)
+            if ret:
+                stat_obj = ret[0]
+            data.append(StoragePoolDTO.from_model(pool, stat_obj))
+    return data
 
 
 class StoragePoolParams(BaseModel):
@@ -73,7 +76,11 @@ def _lookup_storage_pool(pool_id: UUID) -> PoolModel:
 
 @instance_api.get('/', name='clusters:storage-pools:detail')
 def get(cluster: Cluster, pool: StoragePool) -> StoragePoolDTO:
-    return StoragePoolDTO.from_model(pool)
+    stat_obj = None
+    ret = db.get_pool_stats(pool, 1)
+    if ret:
+        stat_obj = ret[0]
+    return StoragePoolDTO.from_model(pool, stat_obj)
 
 
 @instance_api.delete('/', name='clusters:storage-pools:delete', status_code=204, responses={204: {"content": None}})
diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py
index e612d7177..aa7923d36 100644
--- a/simplyblock_web/api/v2/storage_node.py
+++ b/simplyblock_web/api/v2/storage_node.py
@@ -22,11 +22,14 @@
 
 @api.get('/', name='clusters:storage-nodes:list')
 def list(cluster: Cluster) -> List[StorageNodeDTO]:
-    return [
-        StorageNodeDTO.from_model(storage_node)
-        for storage_node
-        in db.get_storage_nodes_by_cluster_id(cluster.get_id())
-    ]
+    data = []
+    for storage_node in db.get_storage_nodes_by_cluster_id(cluster.get_id()):
+        node_stat_obj = None
+        ret = db.get_node_capacity(storage_node, 1)
+        if ret:
+            node_stat_obj = ret[0]
+        data.append(StorageNodeDTO.from_model(storage_node, node_stat_obj))
+    return data
 
 
 class StorageNodeParams(BaseModel):
@@ -86,7 +89,11 @@ def _lookup_storage_node(storage_node_id: UUID) -> StorageNodeModel:
 
 @instance_api.get('/', name='clusters:storage-nodes:detail')
 def get(cluster: Cluster, storage_node: StorageNode):
-    return StorageNodeDTO.from_model(storage_node)
+    node_stat_obj = None
+    ret = db.get_node_capacity(storage_node, 1)
+    if ret:
+        node_stat_obj = ret[0]
+    return StorageNodeDTO.from_model(storage_node, node_stat_obj)
 
 
 @instance_api.delete('/', name='clusters:storage-nodes:delete')
diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py
index 698788718..6755a1149 100644
--- a/simplyblock_web/api/v2/volume.py
+++ b/simplyblock_web/api/v2/volume.py
@@ -21,11 +21,14 @@
 
 @api.get('/', name='clusters:storage-pools:volumes:list')
 def list(request: Request, cluster: Cluster, pool: StoragePool) -> List[VolumeDTO]:
-    return [
-        VolumeDTO.from_model(lvol, request, cluster.get_id())
-        for lvol
-        in db.get_lvols_by_pool_id(pool.get_id())
-    ]
+    data = []
+    for lvol in db.get_lvols_by_pool_id(pool.get_id()):
+        stat_obj = None
+        ret = db.get_lvol_stats(lvol, 1)
+        if ret:
+            stat_obj = ret[0]
+        data.append(VolumeDTO.from_model(lvol, request, cluster.get_id(), stat_obj))
+    return data
 
 
 class _CreateParams(BaseModel):
@@ -122,7 +125,11 @@ def _lookup_volume(volume_id: UUID) -> LVol:
 
 @instance_api.get('/', name='clusters:storage-pools:volumes:detail')
 def get(request: Request, cluster: Cluster, pool: StoragePool, volume: Volume) -> VolumeDTO:
-    return VolumeDTO.from_model(volume, request, cluster.get_id())
+    stat_obj = None
+    ret = db.get_lvol_stats(volume, 1)
+    if ret:
+        stat_obj = ret[0]
+    return VolumeDTO.from_model(volume, request, cluster.get_id(), stat_obj)
 
 
 class UpdatableLVolParams(BaseModel):

From 8dcc73cc38435d53371885339113e25fdb34940e Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Thu, 18 Dec 2025 15:21:03 +0300
Subject: [PATCH 58/68] Adds cluster rebalancing event (#823)

---
 simplyblock_core/controllers/cluster_events.py    | 10 ++++++++++
 simplyblock_core/services/storage_node_monitor.py | 12 ++++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/simplyblock_core/controllers/cluster_events.py b/simplyblock_core/controllers/cluster_events.py
index e8e6c406e..059aea976 100644
--- a/simplyblock_core/controllers/cluster_events.py
+++ b/simplyblock_core/controllers/cluster_events.py
@@ -80,3 +80,13 @@ def cluster_delete(cluster):
         db_object=cluster,
         caused_by=ec.CAUSED_BY_CLI,
         message=f"Cluster deleted {cluster.get_id()}")
+
+
+def cluster_rebalancing_change(cluster, new_state, old_status):
+    ec.log_event_cluster(
+        cluster_id=cluster.get_id(),
+        domain=ec.DOMAIN_CLUSTER,
+        event=ec.EVENT_STATUS_CHANGE,
+        db_object=cluster,
+        caused_by=ec.CAUSED_BY_CLI,
+        message=f"Cluster rebalancing changed from {old_status} to {new_state}")
diff --git a/simplyblock_core/services/storage_node_monitor.py b/simplyblock_core/services/storage_node_monitor.py
index bfb92c11b..e7f32ad82 100644
--- a/simplyblock_core/services/storage_node_monitor.py
+++ b/simplyblock_core/services/storage_node_monitor.py
@@ -5,7 +5,8 @@
 
 
 from simplyblock_core import constants, db_controller, cluster_ops, storage_node_ops, utils
-from simplyblock_core.controllers import health_controller, device_controller, tasks_controller, storage_events
+from simplyblock_core.controllers import health_controller, device_controller, tasks_controller, storage_events, \
+    cluster_events
 from simplyblock_core.models.cluster import Cluster
 from simplyblock_core.models.job_schedule import JobSchedule
 from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice
@@ -134,10 +135,13 @@ def update_cluster_status(cluster_id):
             JobSchedule.FN_DEV_MIG, JobSchedule.FN_NEW_DEV_MIG, JobSchedule.FN_FAILED_DEV_MIG]:
             if task.retry == 0:
                 first_iter_task_pending += 1
-
+    is_re_balancing = first_iter_task_pending  > 0
     cluster = db.get_cluster_by_id(cluster_id)
-    cluster.is_re_balancing = first_iter_task_pending  > 0
-    cluster.write_to_db()
+    if cluster.is_re_balancing != is_re_balancing:
+        old_status = cluster.is_re_balancing
+        cluster.is_re_balancing = is_re_balancing
+        cluster.write_to_db()
+        cluster_events.cluster_rebalancing_change(cluster_id, cluster.is_re_balancing, old_status)
 
     current_cluster_status = cluster.status
     logger.info("cluster_status: %s", current_cluster_status)

From 14e3c06f619640130f97995650587da55bc16d7e Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Fri, 19 Dec 2025 20:12:19 +0300
Subject: [PATCH 59/68] Add snode port change event (#824)

* Add snode port change event

* fix linter
---
 e2e/continuous_log_collector.py                      |  1 -
 e2e/e2e_tests/cluster_test_base.py                   |  2 +-
 .../continuous_failover_ha_multi_client.py           |  4 ++--
 ...ntinuous_failover_ha_multi_client_quick_outage.py |  3 +--
 e2e/utils/ssh_utils.py                               |  3 ++-
 simplyblock_core/controllers/storage_events.py       | 12 ++++++++++++
 simplyblock_core/storage_node_ops.py                 |  1 +
 7 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/e2e/continuous_log_collector.py b/e2e/continuous_log_collector.py
index 96b157760..d1ea68c38 100644
--- a/e2e/continuous_log_collector.py
+++ b/e2e/continuous_log_collector.py
@@ -1,6 +1,5 @@
 import os
 from datetime import datetime
-from pathlib import Path
 from utils.ssh_utils import SshUtils, RunnerK8sLog
 from logger_config import setup_logger
 
diff --git a/e2e/e2e_tests/cluster_test_base.py b/e2e/e2e_tests/cluster_test_base.py
index 15743725b..d37222c88 100644
--- a/e2e/e2e_tests/cluster_test_base.py
+++ b/e2e/e2e_tests/cluster_test_base.py
@@ -405,7 +405,7 @@ def collect_management_details(self, post_teardown=False):
             self.ssh_obj.exec_command(self.mgmt_nodes[0], cmd)
 
             node+=1
-        all_nodes = self.storage_nodes + self.mgmt_nodes + self.client_machines:
+        all_nodes = self.storage_nodes + self.mgmt_nodes + self.client_machines
         for node in all_nodes:
             base_path = os.path.join(self.docker_logs_path, node)
             cmd = f"journalctl -k --no-tail >& {base_path}/jounalctl_{node}-final.txt"
diff --git a/e2e/stress_test/continuous_failover_ha_multi_client.py b/e2e/stress_test/continuous_failover_ha_multi_client.py
index a97c42676..0f0c9f94e 100644
--- a/e2e/stress_test/continuous_failover_ha_multi_client.py
+++ b/e2e/stress_test/continuous_failover_ha_multi_client.py
@@ -329,7 +329,7 @@ def perform_random_outage(self):
         for node in self.sn_nodes_with_sec:
             # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
             #                          storage_node_id=node)
-            self.logger.info(f"Skipping lvstore dump!!")
+            self.logger.info("Skipping lvstore dump!!")
         for node in self.sn_nodes_with_sec:
             cur_node_details = self.sbcli_utils.get_storage_node_details(node)
             cur_node_ip = cur_node_details[0]["mgmt_ip"]
@@ -663,7 +663,7 @@ def restart_nodes_after_failover(self, outage_type, restart=False):
         for node in self.sn_nodes_with_sec:
             # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
             #                          storage_node_id=node)
-            self.logger.info(f"Skipping lvstore dump!!")
+            self.logger.info("Skipping lvstore dump!!")
 
     def create_snapshots_and_clones(self):
         """Create snapshots and clones during an outage."""
diff --git a/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py
index afa98b055..c2c1051a2 100644
--- a/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py
+++ b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py
@@ -306,7 +306,7 @@ def _seed_snapshots_and_clones(self):
                 if err:
                     nqn = self.sbcli_utils.get_lvol_details(lvol_id=self.clone_mount_details[clone_name]["ID"])[0]["nqn"]
                     self.ssh_obj.disconnect_nvme(node=client, nqn_grep=nqn)
-                    self.logger.info(f"[LFNG] connect clone error → cleanup")
+                    self.logger.info("[LFNG] connect clone error → cleanup")
                     self.sbcli_utils.delete_lvol(lvol_name=clone_name, max_attempt=20, skip_error=True)
                     sleep_n_sec(3)
                     del self.clone_mount_details[clone_name]
@@ -431,7 +431,6 @@ def _perform_outage(self):
         return outage_type
 
     def restart_nodes_after_failover(self, outage_type):
-        node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node)
 
         self.logger.info(f"[LFNG] Recover outage={outage_type} node={self.current_outage_node}")
 
diff --git a/e2e/utils/ssh_utils.py b/e2e/utils/ssh_utils.py
index ee265d507..a50a61726 100644
--- a/e2e/utils/ssh_utils.py
+++ b/e2e/utils/ssh_utils.py
@@ -2891,7 +2891,8 @@ def stop_log_monitor(self):
             print("K8s log monitor thread stopped.")
 
 def _rid(n=6):
-    import string, random
+    import string
+    import random
     letters = string.ascii_uppercase
     digits = string.digits
     return random.choice(letters) + ''.join(random.choices(letters + digits, k=n-1))
diff --git a/simplyblock_core/controllers/storage_events.py b/simplyblock_core/controllers/storage_events.py
index bd5a9eb8d..027f7dbed 100644
--- a/simplyblock_core/controllers/storage_events.py
+++ b/simplyblock_core/controllers/storage_events.py
@@ -84,3 +84,15 @@ def jm_repl_tasks_found(node, jm_vuid, caused_by=ec.CAUSED_BY_MONITOR):
         event_level=EventObj.LEVEL_WARN,
         message=f"JM replication task found for jm {jm_vuid}",
         node_id=node.get_id())
+
+
+def node_ports_changed(node, caused_by=ec.CAUSED_BY_MONITOR):
+    ec.log_event_cluster(
+        cluster_id=node.cluster_id,
+        domain=ec.DOMAIN_CLUSTER,
+        event=ec.EVENT_STATUS_CHANGE,
+        db_object=node,
+        caused_by=caused_by,
+        event_level=EventObj.LEVEL_WARN,
+        message=f"Storage node ports set, LVol:{node.lvol_subsys_port} RPC:{node.rpc_port} Internal:{node.nvmf_port}",
+        node_id=node.get_id())
diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py
index 582fe918d..daa99d31a 100644
--- a/simplyblock_core/storage_node_ops.py
+++ b/simplyblock_core/storage_node_ops.py
@@ -3600,6 +3600,7 @@ def create_lvstore(snode, ndcs, npcs, distr_bs, distr_chunk_bs, page_size_in_blo
 
         sec_node.write_to_db()
 
+    storage_events.node_ports_changed(snode)
     return True
 
 

From 1a7d4f48b4238c55fec63d0f6a8e027a04b77245 Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Fri, 19 Dec 2025 20:58:24 +0300
Subject: [PATCH 60/68] Fix snode health check cluster logs (#825)

---
 simplyblock_core/storage_node_ops.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py
index daa99d31a..a6d89b74d 100644
--- a/simplyblock_core/storage_node_ops.py
+++ b/simplyblock_core/storage_node_ops.py
@@ -1907,7 +1907,6 @@ def restart_storage_node(
         return False
     if snode.enable_ha_jm:
         snode.remote_jm_devices = _connect_to_remote_jm_devs(snode)
-    snode.health_check = True
     snode.lvstore_status = ""
     snode.write_to_db(db_controller.kv_store)
 
@@ -2968,7 +2967,6 @@ def set_node_status(node_id, status, reconnect_on_online=True):
             return False
         if snode.enable_ha_jm:
             snode.remote_jm_devices = _connect_to_remote_jm_devs(snode)
-        snode.health_check = True
         snode.write_to_db(db_controller.kv_store)
         distr_controller.send_cluster_map_to_node(snode)
 

From 8d9d0a5e45d78f1b8dd9c1a29c2a353aeec076fd Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Mon, 22 Dec 2025 18:10:35 +0300
Subject: [PATCH 61/68] Fix prom client cluster ip in case of k8s (#826)

* Fix prom client cluster ip in case of k8s

* fix
---
 simplyblock_core/prom_client.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/simplyblock_core/prom_client.py b/simplyblock_core/prom_client.py
index 82756161b..833d42b36 100644
--- a/simplyblock_core/prom_client.py
+++ b/simplyblock_core/prom_client.py
@@ -2,6 +2,7 @@
 import re
 from datetime import datetime, timedelta
 
+from simplyblock_core import constants
 from simplyblock_core.db_controller import DBController
 from simplyblock_core.models.mgmt_node import MgmtNode
 
@@ -20,13 +21,16 @@ class PromClient:
     def __init__(self, cluster_id):
         db_controller = DBController()
         cluster_ip = None
-        for node in db_controller.get_mgmt_nodes():
-            if node.cluster_id == cluster_id and node.status == MgmtNode.STATUS_ONLINE:
-                cluster_ip = node.mgmt_ip
-                break
-        if cluster_ip is None:
-            raise PromClientException("Cluster has no online mgmt nodes")
-
+        cluster = db_controller.get_cluster_by_id(cluster_id)
+        if cluster.mode == "docker":
+            for node in db_controller.get_mgmt_nodes():
+                if node.cluster_id == cluster_id and node.status == MgmtNode.STATUS_ONLINE:
+                    cluster_ip = node.mgmt_ip
+                    break
+            if cluster_ip is None:
+                raise PromClientException("Cluster has no online mgmt nodes")
+        else:
+            cluster_ip = constants.PROMETHEUS_STATEFULSET_NAME
         self.ip_address = f"{cluster_ip}:9090"
         self.url = 'http://%s/' % self.ip_address
         self.client = PrometheusConnect(url=self.url, disable_ssl=True)

From 1a35e2d0abcdded6d18e9cb63828ef87c0fd51d7 Mon Sep 17 00:00:00 2001
From: schmidt-scaled <schmidt@scaled.cloud>
Date: Tue, 23 Dec 2025 21:16:45 +0300
Subject: [PATCH 62/68] set size of lvstore cluster in constants (as ratio to
 distrib page size)

---
 .../controllers/lvol_migration_controller.py  | 582 ++++++++++--------
 simplyblock_core/models/lvol_migration.py     |  35 +-
 simplyblock_core/models/lvol_model.py         |  41 +-
 simplyblock_core/models/snapshot.py           |  34 +-
 simplyblock_core/rpc_client.py                |  84 ++-
 simplyblock_core/storage_node_ops.py          |   2 +-
 6 files changed, 477 insertions(+), 301 deletions(-)

diff --git a/simplyblock_core/controllers/lvol_migration_controller.py b/simplyblock_core/controllers/lvol_migration_controller.py
index 5aefd49d0..cc0c100d3 100644
--- a/simplyblock_core/controllers/lvol_migration_controller.py
+++ b/simplyblock_core/controllers/lvol_migration_controller.py
@@ -7,7 +7,7 @@
 from ..cluster_ops import db_controller
 from ..models.lvol_migration import *
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, Dict
 from simplyblock_core.storage_node_ops import *
 from simplyblock_core.db_controller import *
 from simplyblock_core.models.lvol_model import LVol
@@ -19,9 +19,16 @@
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 import uuid
+import copy
 
 
 
+#TODOS: Integrate in Task Mgmt
+#       Asynchronous delete of objects must check results before sync delete and cleanup is ready
+#       must reconnect rpc clients after node restart
+#       double-check all object states
+#       we must convert with previous
+
 # ---------------------------------------------------------------------------
 # Migration Service
 # ---------------------------------------------------------------------------
@@ -74,103 +81,6 @@ class MigrationService:
 # Migration Controller
 # ---------------------------------------------------------------------------
 
-
-def get_lvol_by_name(lvol_name):
-    return LVol
-
-
-def snap_assign(lvol: LogicalVolumeRef, snap: SnapShot, target_lvs: str):
-    s = Snapshot()
-    s.retry = 0
-    s.status = ObjectMigrationState.NEW
-    s.bdev_name = snap.snap_bdev.split("/", 1)[1]
-    s.lvs_name = lvol.lvs_name
-    s.lvol_size = snap.size
-    s.target_lvs_name = lvol.target_lvs_name
-    s.target_lvs_name = target_lvs
-    s.uuid = snap.uuid
-    s.source_uuid = snap.snap_uuid
-    return s
-
-
-def snap_init(uuid: str, lvol: LogicalVolumeRef, target_lvs: str):
-    s = Snapshot()
-    s.retry = 0
-    s.status = ObjectMigrationState.NEW
-    s.bdev_name = "MIG_SNAP"
-    s.lvs_name = lvol.lvs_name
-    s.lvol_size = lvol.size
-    s.target_lvs_name = lvol.target_lvs_name
-    s.target_lvs_name = target_lvs
-    s.uuid = uuid
-    s.source_uuid = uuid
-    return s
-
-
-def check_nodes_online(n1: StorageNode, n2: StorageNode, n3: StorageNode, n4: StorageNode):
-    if (n1.status == StorageNode.STATUS_ONLINE and
-            n2.status == StorageNode.STATUS_ONLINE and
-            n3.status == StorageNode.STATUS_ONLINE and
-            n4.status == StorageNode.STATUS_ONLINE):
-        return True
-    return False
-
-
-def delete_hub_lvol_controller():
-    return -1
-
-
-def lvol_assign(lvol:LVol, target_lvs: str):
-    m=MigrationObject()
-    m.main_logical_volume.state = ObjectMigrationState.NEW
-
-    #unique identifier:
-    m.main_logical_volume.retry=0
-    m.main_logical_volume.uuid = lvol.uuid
-    m.main_logical_volume.bdev_name = lvol.lvol_bdev
-    m.main_logical_volume.lvs_name = lvol.lvs_name
-    m.main_logical_volume.target_lvs_name = target_lvs
-    m.main_logical_volume.nqn = lvol.nqn
-    m.main_logical_volume.source_uuid = lvol.lvol_uuid
-    m.main_logical_volume.node_id = lvol.hostname
-    if lvol.crypto_bdev != "":
-       m.main_logical_volume.crypto_bdev_name = lvol.crypto_bdev
-    m.main_logical_volume.mapid = 0
-    m.main_logical_volume.size = lvol.size
-    m.main_logical_volume.ndcs = lvol.ndcs
-    m.main_logical_volume.npcs = lvol.npcs
-    m.main_logical_volume.priority_class = lvol.lvol_priority_class
-    m.main_logical_volume.namespace_id = lvol.namespace
-    m.main_logical_volume.cloned = lvol.cloned_from_snap
-    return m.main_logical_volume
-
-
-def get_transfer_state(lvolname: str, node_id: str):
-    offset=0
-    return -1,offset
-
-
-def create_snapshot(lvol: LogicalVolumeRef):
-        return -1,""
-
-
-def migrations_list():
-    db_controller = DBController()
-    migrations = db_controller.get_migrations()
-    data = []
-    for m in migrations:
-        logger.debug(m)
-        data.append({
-            "UUID": m.uuid,
-            "Lvol UUID": m.main_logical_volume.uuid,
-            "Primary (source):": m.node_pri,
-            "Primary (target):": m.target_node_pri,
-            "DateTime:": m.create_dt,
-            "Status": m.status,
-        })
-    return utils.print_table(data)
-
-
 class MigrationController:
     """Controller orchestrates LVOL migrations."""
 
@@ -182,145 +92,327 @@ def __init__(self):
         self.db_controller = DBController()
         self.prev_time = datetime.now()
 
-    def connect_client(self, node:StorageNode):
-        return RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=3, retry=1)
-
+    #connect clients for both primary (source) and secondary (target) nodes
     def connect_clients(self):
+      try:
         self.m.rpc_client1 = self.connect_client(self.m.node_pri)
         self.m.rpc_client2 = self.connect_client(self.m.node_sec)
         self.m.rpc_client3 = self.connect_client(self.m.target_node_pri)
         self.m.rpc_client4 = self.connect_client(self.m.target_node_sec)
-        return
+      except:
+        raise f"migration {self.m.uuid}: cannot create rpc client for all nodes. all nodes online?"
+      return
+
+    def get_rpc_client(self, node: StorageNode):
+        if node.uuid == self.m.node_pri.uuid:
+            client = self.m.rpc_client1
+        elif node.uuid == self.m.target_node_pri.uuid:
+            client = self.m.rpc_client3
+        elif node.uuid == self.m.node_sec.uuid:
+            client = self.m.rpc_client2
+        elif node.uuid == self.m.target_node_sec.uuid:
+            client = self.m.rpc_client4
+        else:
+            raise RuntimeError(f"migration {self.m.uuid}: invalid node {node.uuid}, stopping. ")
+        if not client or node.status != StorageNode.STATUS_ONLINE:
+            raise RuntimeError(f"migration {self.m.uuid}: node {node.uuid} not online, stopping. ")
+        return client
+
+    def snap_assign(self, lvol: LogicalVolumeRef, snap: SnapShot):
+        s = Snapshot()
+        s.lvol=lvol
+        s.snap=snap
+        return s
+
+    def lvol_assign(self, lvol: LVol):
+        m = LogicalVolumeRef()
+        m.lvol = lvol
+        return m
+
+    def check_nodes_online(self):
+        if self.m.node_pri.status == StorageNode.STATUS_ONLINE and self.m.node_sec.status == StorageNode.STATUS_ONLINE and self.m.target_node_pri.status == StorageNode.STATUS_ONLINE and self.m.target_node_sec.status == StorageNode.STATUS_ONLINE:
+               return True
+        return False
+
+    def raise_exception_on_error(self, ret: dict, err_str: str):
+        error="object not found"
+        if not ret or "error" in ret:
+            if ret:
+                error = f"{ret['error']['message']}:{ret['error']['code']}"
+            raise RuntimeError(
+                f"migration {self.m.uuid}:" + err_str + f": {error}")
+        return True
+
+    def get_transfer_state(self, node: StorageNode, counter: int):
+        client = self.get_rpc_client(node)
+        for m in self.m.completion_poll_queue:
+            if m.status==ObjectMigrationState.TRANSFER:
+              try:
+                 name=m.snap.lvol.lvs_name+"/"+m.snap.snap_bdev
+                 ret = client.bdev_lvol_transfer_stat(name)
+                 self.raise_exception_on_error(ret, f"could not get transfer state for lvol: {name}")
+                 if ret["transfer_state"]=="Done":
+                        m.status=ObjectMigrationState.TRANSFERRED
+                        self.m.write_to_db(db_controller.kv_store)
+                        self.m.completion_poll_queue.remove(m)
+                        return True, 0
+                 else:
+                     return False, ret["offset"]
+              except:
+                  logger.error(f"could not get transfer state for lvol")
+                  return False, 0
+        return False, 0
+
+    def create_snapshot(self, node: StorageNode, index: int):
+        client = self.get_rpc_client(node)
+        ret=client.lvol_exists(node.lvstore,"mig_snap_"+str(index)+"_"+self.m.vol.lvol.lvol_name)
+        if not ret or "error" in ret:
+            ret=client.lvol_create_snapshot(self.m.vol.lvol.lvol_uuid, "mig_snap_"+str(index)+"_"+self.m.vol.lvol.lvol_name)
+            self.raise_exception_on_error(ret, f"could not create snapshot for lvol: {self.m.vol.lvol.uuid}")
+        for sn in self.m.snapshots:
+            if sn.snap.uuid==ret["result"]:
+                return True
+        s=self.snap_assign(self.m.vol,ret["result"])
+        self.m.snapshots.append(s)
+        return True
+
+    def migrations_list(self):
+        self.db_controller = DBController()
+        migrations = db_controller.get_migrations()
+        data = []
+        for m in migrations:
+            logger.debug(m)
+            data.append({
+                "UUID": m.uuid,
+                "Lvol UUID": m.vol.lvol.uuid,
+                "Primary (source):": m.node_pri,
+                "Primary (target):": m.target_node_pri,
+                "DateTime:": m.create_dt,
+                "Status": m.status,
+            })
+        return utils.print_table(data)
+
+    @staticmethod
+    def connect_client(node:StorageNode):
+        return RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=3, retry=1)
 
     def unfreeze_objects(self):
-        db_controller = DBController()
-        l = db_controller.get_lvol_by_id(self.m.main_logical_volume.uuid)
+        self.db_controller = DBController()
+        l = db_controller.get_lvol_by_id(self.m.vol.lvol.uuid)
         l.frozen = False
         l.write_to_db(db_controller.kv_store)
-        snaps = db_controller.get_snapshots_by_node_id(self.m.node_pri.uuid)
+        snaps = db_controller.get_snapshots_by_node_id(self.m.node_pri)
         for s in snaps:
             s.frozen = False
             s.write_to_db(db_controller.kv_store)
-        return
+        snaps = db_controller.get_snapshots_by_node_id(self.m.node_sec)
+        for s in snaps:
+            s.frozen = False
+            s.write_to_db(db_controller.kv_store)
+        return True
 
-    def create_lvol(self, snap: Snapshot):
-            name = snap.target_lvs_name + "/" + snap.bdev_name
-            if snap.status == ObjectMigrationState.NEW:
-                snap_uuid= get_lvol_by_name(name)
-                if not snap_uuid:
-                   snap_uuid = self.m.rpc_client2.create_lvol(name, snap.size, snap.target_lvs_name,
-                                                         self.m.main_logical_volume.priority_class,
-                                                         self.m.main_logical_volume.ndcs,
-                                                         self.m.main_logical_volume.npcs)
-                if snap_uuid:
-                    snap.target_uuid = snap_uuid
-                    snap.status = ObjectMigrationState.LVOL_CREATED
-                    self.m.write_to_db(self.db_controller.kv_store)
-                else:
-                    raise exception(f"could not create lvol on target. snap: {snap.uuid}")
-            return True
+    def complete_snapshot_migration(self):
+        tr=db_controller.kv_store.create_transaction()
+        #snapshot objects are always create new, while lvols are really migrated
+        for s in self.m.snapshots:
+            if s.status==ObjectMigrationState.DONE:
+                snapshot = copy.copy(s.snap)
+                snapshot.uuid = str(uuid.uuid4())
+                snapshot.snap_uuid = s.target_uuid
+                snapshot.node_id=self.m.node_pri.get_id()
+                snapshot.write_to_db(db_controller.kv_store,tr)
+
+        lvol = copy.copy(self.m.vol.lvol)
+        lvol.node_id=self.m.node_pri.get_id()
+        lvol.lvol_bdev=self.m.vol.lvol.lvol_bdev
+        lvol.blobid=self.m.vol.lvol.blobid
+        lvol.lvol_uuid=self.m.vol.lvol.lvol_uuid
+        lvol.lvs_name=self.m.vol.lvol.lvs_name
+        lvol.write_to_db(db_controller.kv_store,tr)
+        try:
+          tr.commit.wait()
+        except:
+          raise RuntimeError(f"migration {self.m.uuid}: error updating snapshots and volumes in db.")
+        return True
 
-    def set_mig_status(self, snap: Snapshot):
-            name = snap.target_lvs_name + "/" + snap.bdev_name
-            if snap.status == ObjectMigrationState.LVOL_CREATED:
-                if not self.m.rpc_client2.lvol_set_migration_flag(name):
-                    raise f'issue creating an target object during migration of snapshot {snap.uuid} '
-                else:
-                    snap.status = ObjectMigrationState.MIG_FLAG_SET
-                    self.m.write_to_db(self.db_controller.kv_store)
+    def create_lvol(self, node: StorageNode, snap: Snapshot):
+            client = self.get_rpc_client(node)
+            name = node.lvstore + "/" + snap.snap.snap_bdev
+            snap_uuid = client.lvol_exists(node.lvstore,node.lvstore+"/"+snap.snap.snap_bdev)
+            if not snap_uuid or "error" in snap_uuid:
+                     snap_uuid = client.create_lvol(name, snap.snap.size, self.m.target_node_pri.lvstore,
+                                                         self.m.vol.lvol.lvol_priority_class,
+                                                         self.m.vol.lvol.ndcs,
+                                                         self.m.vol.lvol.npcs)
+                     self.raise_exception_on_error(snap_uuid,f"could not create lvol on target: {snap.snap.uuid}")
+            snap.target_uuid = snap_uuid["result"]
             return True
 
-    def check_online_and_leader(self, node: StorageNode):
-        if node.uuid==self.m.node_pri.uuid:
-            client=self.m.rpc_client1
-        elif node.uuid==self.m.target_node_pri.uuid:
-            client=self.m.rpc_client3
-        elif node.uuid==self.m.node_sec.uuid:
-            client=self.m.rpc_client2
-        elif node.uuid==self.m.target_node_sec.uuid:
-            client = self.m.rpc_client4
-        else:
-            raise f"migration: invalid node, cannot cleanup: {self.m.uuid}"
-        if node.status!=StorageNode.STATUS_ONLINE:
-            raise f"migration: node not online, cannot cleanup: {self.m.uuid}"
-        return client
+    def set_mig_status(self, node: StorageNode, snap: Snapshot):
+            client = self.get_rpc_client(node)
+            name = self.m.target_node_pri.lvstore + "/" + snap.snap.snap_bdev
+            ret=client.bdev_lvol_set_migration_flag(name)
+            self.raise_exception_on_error(ret, f"issue creating an target object during migration of snapshot {snap.uuid}")
+            snap.status = ObjectMigrationState.MIG_FLAG_SET
+            self.m.write_to_db(self.db_controller.kv_store)
+            return True
 
-    def export_lvol(self, s: Snapshot):
-        client = self.check_online_and_leader(self.m.node_pri)
-        nqn=generate_nqn()
-        client.subsystem_create(nqn,"tmp-mig", "sb-internal", 1, 1)
-        client.nvmf_subsystem_add_ns(s.temporary_nqn,s.lvs_name+"/"+s.bdev_name)
-        if self.m.target_node_pri.active_rdma:
-            fabric="RDMA"
-        else:
-            fabric="TCP"
-        client.nvmf_subsystem_add_listener(s.temporary_nqn, fabric,self.m.target_node_pri.nvmf_port,
-                    self.m.target_node_pri.hostname, "optimized")
-        return
+    def export_lvol(self, node: StorageNode, nqn: str, s: Snapshot, anaState: str, namespaces: int, serial: str, model: str):
+        client = self.get_rpc_client(node)
+        #check if subsystem exists, namespace is added and listener exists
+        #nqn=generate_nqn()
+        ss,listener,ns=client.find_subsystem_by_nqn(nqn)
+        if not ss:
+             ret=client.subsystem_create(nqn,serial, model, 1, namespaces)
+             self.raise_exception_on_error(ret, f"could not list subsystem for lvol: {s.snap.uuid}")
+        if not ns:
+             ret=client.nvmf_subsystem_add_ns(s.temporary_nqn,s.snap.lvol.lvs_name+"/"+s.snap.snap_bdev)
+             self.raise_exception_on_error(ret,f"could not list subsystem for lvol: {s.snap.uuid} ")
+        if not listener:
+            if self.m.target_node_pri.active_rdma:
+               fabric="RDMA"
+            else:
+               fabric="TCP"
+            ret=client.nvmf_subsystem_add_listener(s.temporary_nqn, fabric,self.m.target_node_pri.nvmf_port,
+                    self.m.target_node_pri.hostname, anaState)
+            self.raise_exception_on_error(ret, f"could not list subsystem for lvol: {s.snap.uuid}")
+        return True
 
     #delete subystem only, if there is only zero or one namespaces left;
     #if one namespace is left, it must match the volume
-    def delete_nqn_and_namespace(self, node: StorageNode, nqn:str, lvol: LVol):
-        client=self.check_online_and_leader(node)
+    def delete_subsystem(self, node: StorageNode, nqn:str, lvol: LVol):
+        client=self.get_rpc_client(node)
         data=client.subsystem_list(nqn)
+        if not data:
+            return False
+        ret = None
         for subsystem in data['result']:
             # Check if the subsystem has namespaces
-            namespaces = subsystem.get('namespaces')
-            if len(namespaces)==1:
-               for ns in namespaces:
-                   if ns['nsid']==lvol.namespace:
-                       client.subsystem_delete(nqn)
-            if len(namespaces)==0:
-                client.subsystem_delete(nqn)
-        return
-
-
-    def connect_lvol(self, s: Snapshot):
+            namespaces = subsystem.get('namespaces', None)
+            if not namespaces or len(namespaces<2):
+                   ret=client.subsystem_delete(nqn)
+                   self.raise_exception_on_error(data, f"could not delete subsystem: {nqn} for lvol: {lvol.uuid}")
+            elif len(namespaces>1):
+                client.nvmf_subsystem_remove_ns(nqn,lvol.namespace)
+        return True
 
-        return
+    def connect_lvol(self, node: StorageNode, s: Snapshot):
+        client = self.get_rpc_client(node)
+        if node.active_rdma:
+            transport="RDMA"
+        else:
+            transport="TCP"
+        ret=client.nvmf_get_subsystems()
+        subsystem = None
+        if ret and not "error" in ret:
+            subsystem = next((s for s in ret["result"] if s["nqn"] == s.temporary_nqn), None)
+        attach=True
+        if subsystem:
+            attach=False
+            first_namespace_name = subsystem.get("namespaces", [{}])[0].get("name")
+            if first_namespace_name == None:
+                client.bdev_nvme_detach_controller(s.snap.snap_bdev)
+                self.raise_exception_on_error(ret, f"could not remove remote controller: {s.snap.uuid}")
+            attach=True
+        if attach:
+           ret = client.bdev_nvme_attach_controller(s.snap.snap_bdev,s.temporary_nqn,node.hostname,node.nvmf_port,transport)
+           self.raise_exception_on_error(ret, f"could not connect lvol: {s.snap.uuid}")
+           s.controller = ret[0]
+        return True
 
-    def delete_lvol_from_node(self, node: StorageNode, oid: str, deleteType: Boolean):
-        client=self.check_online_and_leader(node)
+    def delete_lvol_from_node(self, node: StorageNode, oid: str, deleteType: bool):
+        client=self.get_rpc_client(node)
         lvol=db_controller.get_lvol_by_id(oid)
         if lvol:
-           client.delete_lvol(lvol.lvs_name+"/"+lvol.lvol_name, deleteType)
+           ret=client.delete_lvol(lvol.lvs_name+"/"+lvol.lvol_name, deleteType)
         else:
            snap=db_controller.get_snapshot_by_id(oid)
-           client.delete_lvol(snap.lvol.lvs_name + "/" + snap.lvol.lvol_name, deleteType)
+           ret=client.delete_lvol(snap.lvol.lvs_name + "/" + snap.lvol.lvol_name, deleteType)
+        self.raise_exception_on_error(ret, f"could not delete snapshot/lvol: {oid} ")
         return
 
-    def transfer_data(self, snap: Snapshot, offset: int):
-            self.m.completion_poll_queue.append(snap)
-            return
+    def transfer_data(self, node: StorageNode, snap: Snapshot, offset: int):
+        try:
+          client = self.get_rpc_client(node)
+          ret=client.bdev_lvol_transfer(snap.snap.lvol.lvs_name+"/"+snap.snap.snap_bdev,offset,4,snap.controller, "migrate")
+          self.raise_exception_on_error(ret, f"could not transfer data: {snap.snap.uuid} ")
+        except Exception as e:
+            logger.error(e)
+            return False
+        return True
 
     def convert_lvol(self, s: Snapshot):
-            client=self.check_online_and_leader(self.m.target_node_pri)
-            client.
-            return
-
-    def convert_to_snap(self, s1, s2: Snapshot):
-            return
+        client=self.get_rpc_client(self.m.target_node_pri)
+        ret=client.bdev_lvol_convert(s.snap.lvol.lvs_name+"/"+s.snap.snap_bdev)
+        if ret and "exists" in ret:
+            return True
+        self.raise_exception_on_error(ret, f"could not convert lvol to snapshot: {s.snap.uuid} to remote subsystem:")
+        return True
 
     def time_difference(self):
-           return (datetime.now()-self.prev_time).total_seconds()
+        return (datetime.now()-self.prev_time).total_seconds()
 
     def create_target_lvol(self, s: Snapshot):
-          return
+        client = self.get_rpc_client(self.m.target_node_pri)
+        ret=client.create_lvol(s.snap.snap_bdev,s.snap.size,self.m.target_node_pri.lvstore,self.m.vol.lvol.lvol_priority_class,self.m.vol.lvol.ndcs,self.m.vol.lvol.npcs)
+        self.raise_exception_on_error(ret, f"could not create target lvol for snapshot:{s.snap.uuid}")
+        return True
 
-    def create_final_lvol(self):
-        return
+    def create_target_lvol2(self, node: StorageNode, l: LogicalVolumeRef):
+        client = self.get_rpc_client(node)
+        if l.lvol.crypto_bdev != "":
+               client.lvol_crypto_create(l.lvol.crypto_bdev,l.lvol.lvol_bdev,l.lvol.crypto_key_name)
+        ret = client.create_lvol(l.lvol.lvol_bdev, l.lvol.size, node.lvstore, l.lvol.lvol_priority_class, l.lvol.ndcs, l.lvol.npcs)
+        ret=client.create_lvol(l.lvol.lvol_bdev,l.lvol.size,node.lvstore,l.lvol.lvol_priority_class,l.lvol.ndcs,l.lvol.npcs)
+        self.raise_exception_on_error(ret, f"could not create target lvol for main lvol:{l.lvol.uuid}")
+        return True
 
-    def connect_hublvol(self):
-          return
+    def connect_hublvol(self, node: StorageNode):
+        client = self.get_rpc_client(node)
+        if node.active_rdma:
+            fabric="RDMA"
+        else:
+            fabric="TCP"
+
+        ret=client.bdev_nvme_controller_list("migratelvol")
+        if not ret:
+           ret=client.bdev_nvme_attach_controller("migratelvol",node.hublvol,node.hostname,node.nvmf_port,fabric)
+           self.raise_exception_on_error(ret, f"could not attach controller for {self.m.vol.lvol.uuid} for hublvol")
+
+        return True
 
     def transfer_data_final(self):
-          return
+        client1 = self.get_rpc_client(self.m.node_pri)
+        client2 = self.get_rpc_client(self.m.target_node_sec)
+        client3 = self.get_rpc_client(self.m.target_node_pri)
+        uuid, map_id = client3.lvol_exists(self.m.target_node_pri,self.m.vol)
+        if not uuid:
+             self.create_target_lvol2(self.m.target_node_pri,self.m.vol)
+             uuid1, _ = client2.lvol_exists(self.m.target_node_sec, self.m.vol)
+             if not uuid1:
+                ret=client2.bdev_lvol_register(self.m.vol.lvol.lvol_bdev,self.m.target_node_sec.lvstore, self.m.vol.lvol.blobid, self.m.vol.lvol.lvol_uuid)
+                self.raise_exception_on_error(ret, f"could not register on secondary {self.m.vol.lvol.uuid}")
+
+        self.connect_hublvol(self.m.node_pri)
+
+        uuid, map_id = client3.lvol_exists(self.m.target_node_pri.lvstore,self.m.vol.lvol.lvol_bdev)
+        if not uuid or not map_id:
+            raise  RuntimeError(
+                f"migration {self.m.uuid}: could not get mapid of volume: {self.m.vol.lvol.uuid}")
+        last_snap_uuid = (self.m.snapshots)[-1].snap.snap_uuid
+        ret = client1.bdev_lvol_final_migration(self.m.vol.lvol.lvol_bdev,map_id,
+                                                last_snap_uuid,4,self.m.target_node_pri.hublvol.nqn)
+        self.raise_exception_on_error(ret, f"could not initiate final lvol migration: {self.m.vol.lvol.uuid}")
+        return True
+
+    def delete_hublvol_controller(self):
+        return
 
     def reconnect_subsystems(self):
-         return
 
-    def set_mig_state_lvol(self, s: Snapshot):
-         return
+        #if "error" in ret:
+        #    raise f"migration {self.m.uuid}: could not convert lvol to snapshot: {s.uuid} to remote subsystem:  {ret["error"]["message"]}:{ret["error"]["code"]}"
+        return
 
     def cleanup_migration(self, status: bool):
         db_controller = DBController()
@@ -329,7 +421,7 @@ def cleanup_migration(self, status: bool):
         #Migration was not successful
         try:
           if self.m.status >= MigrationState.HUBLVOL_CONNECTED:
-              ret= delete_hub_lvol_controller()
+              self.delete_hublvol_controller()
           if not status:
               pri_node=self.m.node_pri
               sec_node=self.m.node_sec
@@ -338,20 +430,23 @@ def cleanup_migration(self, status: bool):
               sec_node = self.m.target_node_sec
 
           if (self.m.status >= MigrationState.TARGET_LVOL_CREATED and not status) or self.m.status == MigrationState.DONE:
-              self.delete_nqn_and_namespace(pri_node, self.m.main_logical_volume.uuid)
-              self.delete_nqn_and_namespace(sec_node, self.m.main_logical_volume.uuid)
-              self.delete_lvol_from_node(pri_node, self.m.main_logical_volume.uuid)
-              self.unregister_lvol_from_node(sec_node, self.m.main_logical_volume.uuid)
+              self.delete_subsystem(pri_node, self.m.vol.lvol.nqn, self.m.vol.lvol)
+              self.delete_subsystem(sec_node, self.m.vol.lvol.uuid, )
+              self.delete_lvol_from_node(pri_node, self.m.vol.lvol.uuid, True)
+              self.(sec_node, self.m.vol.lvol.uuid)
 
           snaps = self.m.snapshots
           snaps.reverse()
           for sn in snaps:
-                     if sn.uuid:
-                        rsn = db_controller.get_snapshot_by_id(sn.uuid)
+                     if sn.snap.uuid:
+                        rsn = db_controller.get_snapshot_by_id(sn.snap.uuid)
                         if len(rsn.successor)==1:
-                            self.delete_lvol_from_node(pri_node, sn.uuid)
-                            self.delete_nqn_and_namespace(pri_node,sn.uuid)
-                            self.delete_lvol_from_node(sec_node, sn.uuid)
+
+
+
+                            self.delete_lvol_from_node(pri_node, sn.snap.uuid, True)
+                            self.delete_subsystem(pri_node,sn.snap.uuid)
+                            self.delete_lvol_from_node(sec_node, sn.snap.uuid)
                         else:
                             break
         except:
@@ -361,7 +456,7 @@ def cleanup_migration(self, status: bool):
     def migrate_final_lvol(self):
       try:
         if self.m.status==MigrationState.SNAPS_MIGRATED:
-           self.create_final_lvol()
+           self.transfer_data_final()
         elif self.m.status==MigrationState.TARGET_LVOL_CREATED:
            self.connect_hublvol()
         elif self.m.status==MigrationState.HUBLVOL_CONNECTED:
@@ -371,7 +466,7 @@ def migrate_final_lvol(self):
         elif self.m.status == MigrationState.RECONNECT_DONE:
            self.cleanup_migration(True)
       except:
-        raise f"cannot transfer to target: {self.m.main_logical_volume.uuid}"
+        raise f"cannot transfer to target: {self.m.vol.lvol.uuid}"
       return True
 
     def migrate_snaps(self):
@@ -385,7 +480,7 @@ def migrate_snaps(self):
               if s.status in ObjectMigrationState.NEW:
                   self.create_target_lvol(s)
               elif s.status in ObjectMigrationState.LVOL_CREATED:
-                  self.set_mig_state_lvol(s)
+                  self.set_mig_status(self.m.target_node_pri,s)
               elif s.status in ObjectMigrationState.MIG_FLAG_SET:
                   self.export_lvol(s)
               elif s.status in ObjectMigrationState.LVOL_EXPORTED:
@@ -393,15 +488,15 @@ def migrate_snaps(self):
               elif s.status in ObjectMigrationState.LVOL_CONNECTED:
                   self.transfer_data(s, 0)
               elif s.status==ObjectMigrationState.TRANSFERRED:
-                   self.convert_to_snap(s,p)
+                   self.convert_lvol(s,p)
               elif s.status == ObjectMigrationState.CONVERTED:
-                   self.delete_nqn_and_namespace(self.m.target_node_pri,s.uuid)
+                   self.delete_subsystem(self.m.target_node_pri,s.snap.uuid)
               elif s.status == ObjectMigrationState.CLEANING:
-                   self.delete_lvol_from_node(self.m.target_node_sec, s.uuid)
+                   self.delete_lvol_from_node(self.m.target_node_sec, s.snap.uuid)
               p=s
             if self.m.rerun < 3 or self.time_difference()>5:
-                ret, snap_uuid=create_snapshot(self.m.main_logical_volume)
-                sn= snap_init(snap_uuid, self.m.main_logical_volume, self.m.target_node_pri.lvstore)
+                ret, snap_uuid=self.create_snapshot(self.m.vol)
+                sn=self.snap_assign(self.m.vol,snap_uuid)
                 self.m.snapshots.append(sn)
                 self.prev_time=datetime.now()
                 self.migrate_snaps()
@@ -415,33 +510,35 @@ def migrate_snaps(self):
                self.cleanup_migration(False)
         return True
 
-    def lvol_migrate(self, lvol: LogicalVolumeRef, target_node: StorageNode, m: MigrationObject=None):
+    def lvol_migrate(self, lvol: LVol, target_node: StorageNode, m: MigrationObject=None):
         """Migrate a logical volume and its snapshots/clones."""
 
         # if this Migration Object does not exist (first call to lvol_migrate):
         if not m:
+          try:
             self.m = MigrationObject()
             self.m.uuid = str(uuid.uuid4())
             self.m.create_dt = str(datetime.datetime)
             self.m.status = MigrationState.NEW
             self.m.write_to_db(self.db_controller.kv_store)
+          except:
+              return False #not even in database, lvol_migrate call must be repeated
         else:
             self.m = m
 
-        # update lvol: frozen means it cannot be deleself.m.main_logical_volume ted or resized. new snapshots cannot be taken.
+        # freeze lvols and snapshots during migration
         try:
-            lvol1=self.db_controller.get_lvol_by_id(lvol.uuid)
-            lvol1.frozen = True
-            lvol1.write_to_db(self.db_controller.kv_store)
+            lvol.frozen = True
+            lvol.write_to_db(self.db_controller.kv_store)
 
             # copy now all data from the lvol to the migration lvol (temporary object for lvol during migration)
 
-            self.m.node_pri = StorageNode(self.db_controller.get_storage_node_by_id(lvol1.node_id))
+            self.m.node_pri = StorageNode(self.db_controller.get_storage_node_by_id(lvol.node_id))
             self.m.node_sec = self.db_controller.get_storage_node_by_id(self.m.node_pri.secondary_node_id)
             self.m.target_node_pri = target_node
             self.m.target_node_sec = self.db_controller.get_storage_node_by_id(self.m.target_node_pri.secondary_node_id)
 
-            self.m.main_logical_volume = lvol_assign(lvol1,self.m.node_pri.lvstore)
+            self.m.vol = self.lvol_assign(lvol)
 
             # get all 4 storage node objects: primary, secondary source and target
 
@@ -454,16 +551,20 @@ def lvol_migrate(self, lvol: LogicalVolumeRef, target_node: StorageNode, m: Migr
             self.m.snapshots = []
             sr = None
             for s in snapshots:
-                if s.lvol.uuid == self.m.main_logical_volume.uuid:
+                if s.lvol.uuid == self.m.vol.lvol.uuid:
                     s.frozen = True
                     # need to reset that one on node restart
                     s.write_to_db(self.db_controller.kv_store)
-                    sr = snap_assign(self.m.main_logical_volume, s,  self.m.target_node_pri.lvstore)
+                    sr = self.snap_assign(self.m.vol, s)
                     self.m.snapshots.append(sr)
         except:
-            return False
+            return True
+        self.m.status=MigrationState.RUNNING
+        self.m.write_to_db(self.db_controller.kv_store)
+        self.migrate_snaps()
+        return True
 
-        if check_nodes_online(self.m.node_pri, self.m.node_sec, self.m.target_node_pri, self.m.target_node_sec):
+        if self.check_nodes_online():
             self.m.status = MigrationState.RUNNING
             self.m.write_to_db(self.db_controller.kv_store)
             self.migrate_snaps()
@@ -471,7 +572,7 @@ def lvol_migrate(self, lvol: LogicalVolumeRef, target_node: StorageNode, m: Migr
         else:
             logger.warning(f"Not all nodes online. Suspending lvol life migration {lvol.uuid}")
             self.m.write_to_db(self.db_controller.kv_store)
-            return False
+            return -1
 
     def check_status_migration(self, on_restart: bool):
       while True:
@@ -480,25 +581,22 @@ def check_status_migration(self, on_restart: bool):
             migrations=self.db_controller.get_migrations()
             for m in migrations:
               if m.status!=MigrationState.DONE and m.status!=MigrationState.FAILED:
-                 if check_nodes_online(m.node_pri,self.db_controller.get_storage_node_by_id(m.node_pri.secondary_node_id),
-                                            m.target_node_pri,m.target_node_sec):
+                 if self.check_nodes_online():
                      if m.status==MigrationState.NEW:
-                         self.lvol_migrate(m.main_logical_volume,m.node_pri,m)
+                         self.lvol_migrate(m.vol.lvol,m.node_pri,m)
                      elif m.status==MigrationState.RUNNING:
                          for q in m.completion_poll_queue:
                              m.completion_poll_queue.remove(q)
                              if q.status==ObjectMigrationState.TRANSFER:
-                                 if q.retry>5:
-                                     raise f"could not transfer snapshot. max retries. name: {q.lvs_name + "/" + q.bdev_name}. uuid: {q.uuid}"
-                                 q.retry+=1
-                                 result, offset = get_transfer_state(q.target_lvs_name + "/" + q.bdev_name, self.m.node_pri.uuid)
+                                 result, offset = self.get_transfer_state(self.m.node_pri,q.retry)
                                  if not result:
-                                    self.transfer_data(q,offset)
-                                    m.completion_poll_queue.append(q)
-                                 else:
-                                    q.status=ObjectMigrationState.TRANSFERRED
-                             self.migrate_snaps()
-                     elif m.status in (MigrationState.SNAPS_MIGRATED, MigrationState.HUBLVOL_CONNECTED, MigrationState.TARGET_LVOL_CREATED, MigrationState.TRANSFERRED_TO_TARGET, MigrationState.RECONNECT_DONE):
+                                     if q.retry > 5:
+                                         raise f"could not transfer snapshot. max retries. name: {q.snap.lvol.lvs_name + "/" + q.snap.snap_bdev}. uuid: {q.snap.uuid}"
+                                     q.retry += 1
+                                     self.transfer_data(self.m.node_pri,q,offset)
+                                     m.completion_poll_queue.append(q)
+                         self.migrate_snaps()
+                     else:
                           self.migrate_final_lvol()
           except:
               logger.error(f"migration controller exception. Migration failed: {self.m.uuid} ")
@@ -514,21 +612,13 @@ def add_new_migration(self, lvol, target_node: StorageNode):
             try:
               migrations = self.db_controller.get_migrations()
               for m in migrations:
-                if lvol.node_id==m.main_logical_volume.node_id and (m.status!=MigrationState.DONE or m.status!=MigrationState.FAILED_AND_CLEANED):
+                if lvol.node_id==m.vol.lvol.node_id and (m.status!=MigrationState.DONE or m.status!=MigrationState.FAILED_AND_CLEANED):
                    raise exception("cannot add migration - ongoing migration")
               self.lvol_migrate(lvol, target_node)
             except:
               logger.error(f"could not add lvol {lvol.uuid} for migration as another migration is currently running.")
               return False
-
-        #are all 4 nodes online?
-        #if migration is suspended, resume it. If it was before in
-        #depending on previous state, continue in migrate_snaps, migrate_lvol or cleanup
-        #did total time expire? --> cleanup, failed
-        #any snaps in queue?
-        #poll for completion, trigger restart or if completed change the state
-        #stop
-      return None
+            return self.lvol_migrate(lvol,target_node)
 
     def start_service(self, on_restart=False):
         """
diff --git a/simplyblock_core/models/lvol_migration.py b/simplyblock_core/models/lvol_migration.py
index 01b18fc74..f8cfe456a 100644
--- a/simplyblock_core/models/lvol_migration.py
+++ b/simplyblock_core/models/lvol_migration.py
@@ -4,6 +4,8 @@
 from typing import List
 import storage_node
 from base_model import *
+from simplyblock_core.models.lvol_model import LVol
+from simplyblock_core.models.snapshot import SnapShot
 from simplyblock_core.rpc_client import RPCClient
 
 
@@ -48,42 +50,21 @@ class ObjectMigrationState(str, Enum):
 @dataclass
 class LogicalVolumeRef:
     """Reference to a logical volume participating in a migration."""
-    uuid: str = ""
-    bdev_name: str = ""  # "LVS/LV"
-    lvs_name: str = ""
-    target_lvs_name: str = ""
-    source_uuid: str = ""
+    lvol: LVol = None
     target_uuid: str = ""
-    namespace_id: str = ""
-    nqn : str = ""
-    node_id: str = ""
-    sec_node_id :str =""
-    target_node_id : str = ""
-    target_sec_node_id : str = ""
-    ndcs : int = 1
-    npcs : int = 1
-    priority_class : int = 0
-    size : int = 0
     mapid: int = 0
-    cloned : str = ""
     state : ObjectMigrationState = ObjectMigrationState.NEW
     retry : int = 0
-    crypto_bdev_name: str = ""
 
 @dataclass
 class Snapshot:
-    uuid : str =""
-    bdev_name: str = "" # "LVS/LV"
-    lvs_name: str = ""
-    size: int = 0
-    target_lvs_name : str = ""
-    source_uuid : str = ""
-    target_uuid : str = ""
+    snap: SnapShot = None
+    lvol: LogicalVolumeRef = None
+    controller: str = ""
+    target_uuid: str = ""
     retry : int = 0
     # Migration metadata
     temporary_nqn: str = ""
-    temporary_namespace: str = ""
-    mapid: int = 0
     status: ObjectMigrationState = ObjectMigrationState.NEW
 
 @dataclass
@@ -95,7 +76,7 @@ class MigrationObject(BaseModel):
     status: MigrationState = MigrationState.NEW
     pre_status: MigrationState = MigrationState.NEW
 
-    main_logical_volume : LogicalVolumeRef = None
+    vol : LogicalVolumeRef = None
     node_pri : storage_node.StorageNode = None
     node_sec: storage_node.StorageNode = None
     target_node_pri: storage_node.StorageNode = None
diff --git a/simplyblock_core/models/lvol_model.py b/simplyblock_core/models/lvol_model.py
index baf69477c..b8b06bc37 100644
--- a/simplyblock_core/models/lvol_model.py
+++ b/simplyblock_core/models/lvol_model.py
@@ -5,8 +5,48 @@
 from simplyblock_core.models.base_model import BaseModel
 from simplyblock_core.models.nvme_device import NVMeDevice
 
+import copy
 
 class LVol(BaseModel):
+    # your class-level constants and annotations...
+
+    def __copy__(self):
+        """
+        Shallow copy:
+        - BaseModel fields copied
+        - Mutable fields (list, dict, set) shallow-copied so references do NOT leak
+        """
+        new = type(self)()
+
+        for attr in self.get_attrs_map():
+            value = getattr(self, attr)
+
+            # Copy containers to avoid unintended shared references
+            if isinstance(value, (dict, list, set)):
+                value = value.copy()
+
+            setattr(new, attr, value)
+
+        return new
+
+    def __deepcopy__(self, memo):
+        """
+        Deep copy:
+        - Recursively duplicates everything
+        - Uses memo to avoid infinite recursion
+        """
+        if id(self) in memo:
+            return memo[id(self)]
+
+        new = type(self)()
+        memo[id(self)] = new
+
+        for attr in self.get_attrs_map():
+            value = getattr(self, attr)
+            setattr(new, attr, copy.deepcopy(value, memo))
+
+        return new
+
 
     STATUS_IN_CREATION = 'in_creation'
     STATUS_ONLINE = 'online'
@@ -20,7 +60,6 @@ class LVol(BaseModel):
         STATUS_IN_CREATION: 4,
     }
 
-    base_bdev: str = ""
     bdev_stack: List = []
     blobid: int = 0
     cloned_from_snap: str = ""
diff --git a/simplyblock_core/models/snapshot.py b/simplyblock_core/models/snapshot.py
index 0e76b56dd..378d13b89 100644
--- a/simplyblock_core/models/snapshot.py
+++ b/simplyblock_core/models/snapshot.py
@@ -3,7 +3,7 @@
 from simplyblock_core.models.base_model import BaseModel
 from simplyblock_core.models.lvol_model import LVol
 from typing import List
-
+import copy
 
 class SnapshotRef():
 
@@ -16,7 +16,6 @@ class SnapshotRef():
 
 class SnapShot(BaseModel):
 
-
     STATUS_ONLINE = 'online'
     STATUS_OFFLINE = 'offline'
     STATUS_IN_DELETION = 'in_deletion'
@@ -32,8 +31,8 @@ class SnapShot(BaseModel):
     ref_count: int = 0
     size: int = 0
     used_size: int = 0
-    snap_bdev: str = ""
-    snap_name: str = ""
+    snap_bdev: str = "" #snapshot relative name (part without lvstore)
+    snap_name: str = "" #snapshot full name
     snap_ref_id: str = ""
     snap_uuid: str = ""
     vuid: int = 0
@@ -44,3 +43,30 @@ class SnapShot(BaseModel):
     node_id : str = ""
     successor : List[SnapshotRef] = []
     predecessor: str = ""
+
+    def __copy__(self):
+        # Shallow copy
+        # 1. Copy base class attributes
+        base_copy = super().__copy__() if hasattr(super(), '__copy__') else type(self)()
+
+        # 2. Copy derived attributes
+        new = type(self)()
+        for attr in self.get_attrs_map():
+            value = getattr(self, attr)
+            # Shallow copy for mutable types
+            if isinstance(value, (dict, list, set)):
+                value = value.copy()
+            setattr(new, attr, value)
+        return new
+
+    def __deepcopy__(self, memo):
+        # 1. Deep copy base attributes
+        base_copy = super().__deepcopy__(memo) if hasattr(super(), '__deepcopy__') else type(self)()
+
+        # 2. Deep copy derived attributes
+        new = type(self)()
+        memo[id(self)] = new
+        for attr in self.get_attrs_map():
+            value = getattr(self, attr)
+            setattr(new, attr, copy.deepcopy(value, memo))
+        return new
diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py
index 75ab0c3d0..886695974 100644
--- a/simplyblock_core/rpc_client.py
+++ b/simplyblock_core/rpc_client.py
@@ -1230,7 +1230,7 @@ def nvmf_port_unblock_rdma(self, port):
     def nvmf_get_blocked_ports_rdma(self):
         return self._request("nvmf_get_blocked_ports")
 
-    def lvol_final_migration(
+    def bdev_lvol_final_migration(
             self,
             lvol_name: str,
             lvol_id: str,
@@ -1242,44 +1242,84 @@ def lvol_final_migration(
             "lvol_name": lvol_name,
             "lvol_id": lvol_id,
             "snapshot_name": snapshot_name,
-            "b": batch,
-            "g": nqn
+            "cluster_batch": batch,
+            "gateway": nqn
         }
         return self._request("bdev_lvol_final_migration", params)
 
-    def lvol_set_migration_flag(self, lvol_name: str):
+    def bdev_lvol_set_migration_flag(self, lvol_name: str):
         params = {
             "lvol_name": lvol_name
         }
         return self._request("bdev_lvol_set_migration_flag", params)
 
-    def lvol_convert(self, lvol_name: str):
+    def bdev_lvol_convert(self, lvol_name: str):
         params = {
             "lvol_name": lvol_name
         }
         return self._request("bdev_lvol_convert", params)
 
-    def lvol_add_clone(self, clone_name: str, source_lvol_name: str):
+    def bdev_lvol_add_clone(self, clone_name: str, source_lvol_name: str):
         params = {
             "clone_name": clone_name,
-            "source_lvol_name": source_lvol_name
+            "child_name": source_lvol_name
         }
         return self._request("bdev_lvol_add_clone", params)
 
 
-def lvol_transfer(
+    def bdev_lvol_transfer(self, lvolname: str,o: int,batch: int,gw: str, op: str):
+      params = {
+          "lvol_name" : lvolname,
+          "offset" : o,
+          "cluster_batch" : batch,
+          "gateway"  :gw,
+          "operation" :  op,
+      }
+      return self._request("bdev_lvol_transfer", params)
+
+    def bdev_lvol_get_lvols(
+        self,
+        lvs: str):
+      params = {
+        "lvs_uuid": lvs,
+      }
+      return self._request("bdev_lvol_get_lvols", params)
+
+    def bdev_lvol_transfer_stat(
         self,
-        lvol_name: str,
-        offset: int,
-        batch: int,
-        nqn: str,
-        O: str
-):
-    params = {
-        "n": lvol_name,
-        "o": offset,
-        "b": batch,
-        "g": nqn,
-        "O": O
-    }
-    return self._request("bdev_lvol_transfer", params)
\ No newline at end of file
+        name: str):
+      params = {
+        "lvol_name": name,
+      }
+      return self._request("bdev_lvol_transfer_stat", params)
+
+    def lvol_exists(self, lvs_name, name):
+        params = {
+            "lvs_name": lvs_name,
+        }
+        ret = self._request("bdev_lvol_get_lvols", params)
+        if not ret or "error" in ret:
+            raise RuntimeError(ret["error"])
+        for lvol in ret["result"]:
+            if lvol.get("name") == name:
+                if lvol["map_id"]:
+                   return lvol["uuid"],lvol["map_id"]
+                else:
+                    return lvol["uuid"],None
+        return None, None
+
+    def nvmf_get_subsystems(
+        self):
+      params = {
+      }
+      return self._request("bdev_lvol_transfer_stat", params)
+
+    def find_subsystem_by_nqn(self, nqn_to_find):
+        subsystems = self.nvmf_get_subsystems()
+        for subsystem in subsystems:
+            if subsystem.get("nqn") == nqn_to_find:
+                listener = subsystem.get("listen_addresses", [None])[0] if subsystem.get("listen_addresses") else None
+                nsid = subsystem.get("namespaces", [None])[0].get("nsid") if subsystem.get("namespaces") else None
+                return subsystem, listener, nsid
+        return None, None, None
+
diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py
index 4593ae97b..ae930b44f 100644
--- a/simplyblock_core/storage_node_ops.py
+++ b/simplyblock_core/storage_node_ops.py
@@ -134,7 +134,7 @@ def restart_migration(node:StorageNode):
     for m in migs:
         if m.node_pri==node.uuid:
           if m.status!=MigrationState.DONE:
-            #TODO: continue to run that migration by enabling the migration service.
+            add_task()
     return
 
 def get_next_cluster_device_order(db_controller, cluster_id):

From 340d1aa604342e6140ed9cb4a7a7bf2baaec9ae9 Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Tue, 30 Dec 2025 14:42:48 +0300
Subject: [PATCH 63/68] fix cluster add apiv2 (#829)

---
 simplyblock_web/api/v2/cluster.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py
index 19e9dbbf4..49f8a09e8 100644
--- a/simplyblock_web/api/v2/cluster.py
+++ b/simplyblock_web/api/v2/cluster.py
@@ -63,8 +63,8 @@ def add(request: Request, parameters: ClusterParams):
     if not cluster_id_or_false:
         raise ValueError('Failed to create cluster')
 
-    entity_url = request.app.url_path_for('get', cluster_id=cluster_id_or_false)
-    return Response(status_code=201, headers={'Location': entity_url})
+    cluster = db.get_cluster_by_id(cluster_id_or_false)
+    return ClusterDTO.from_model(cluster)
 
 
 instance_api = APIRouter(prefix='/{cluster_id}')

From 7649052c59e32d4995923fb64f8c8db76e625ec2 Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Fri, 2 Jan 2026 23:46:00 +0300
Subject: [PATCH 64/68] Fix deps version (#832)

---
 simplyblock_core/scripts/install_deps.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/simplyblock_core/scripts/install_deps.sh b/simplyblock_core/scripts/install_deps.sh
index 256a55500..56d0bf96e 100644
--- a/simplyblock_core/scripts/install_deps.sh
+++ b/simplyblock_core/scripts/install_deps.sh
@@ -2,15 +2,15 @@
 
 if [[ "$1" == "docker" ]]; then
   sudo yum install -y yum-utils
-  sudo yum install -y https://repo.almalinux.org/almalinux/9/devel/aarch64/os/Packages/tuned-profiles-realtime-2.24.0-1.el9.noarch.rpm
+  sudo yum install -y https://repo.almalinux.org/almalinux/9/devel/aarch64/os/Packages/tuned-profiles-realtime-2.26.0-1.el9.noarch.rpm
   sudo yum install -y yum-utils xorg-x11-xauth nvme-cli fio tuned
 
   sudo yum install hostname pkg-config git wget python3-pip yum-utils \
     iptables pciutils -y
 
     sudo yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
-    sudo yum install docker-ce docker-ce-cli \
-      containerd.io docker-buildx-plugin docker-compose-plugin -y
+    sudo yum install docker-ce-29.1.3-1.el9 docker-ce-cli-29.1.3-1.el9 \
+      containerd.io-2.2.0-2.el9 docker-buildx-plugin-0.30.1-1.el9 docker-compose-plugin-5.0.1-1.el9 -y
 
   sudo systemctl enable docker
   sudo systemctl start docker

From 3b74ce12f61804281878934ac119872ae9f7b9bf Mon Sep 17 00:00:00 2001
From: "Hamdy H. Khader" <hamdy.khader@gmail.com>
Date: Tue, 6 Jan 2026 16:13:14 +0300
Subject: [PATCH 65/68] fix id_device_by_nqn int+str (#833)

---
 simplyblock_core/utils/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py
index 7bc2fa112..1f086dc2d 100644
--- a/simplyblock_core/utils/__init__.py
+++ b/simplyblock_core/utils/__init__.py
@@ -1103,7 +1103,7 @@ def addNvmeDevices(rpc_client, snode, devs):
             serial_number = nvme_driver_data['ctrlr_data']['serial_number']
             if snode.id_device_by_nqn:
                 if "ns_data" in nvme_driver_data:
-                    serial_number = nvme_driver_data['pci_address'] + nvme_driver_data['ns_data']['id']
+                    serial_number = nvme_driver_data['pci_address'] + str(nvme_driver_data['ns_data']['id'])
                 else:
                     logger.error(f"No subsystem nqn found for device: {nvme_driver_data['pci_address']}")
 

From 21cd2dae33bf9871c9a0e3d94d9b0339f000a161 Mon Sep 17 00:00:00 2001
From: hamdykhader <hamdy.khader@gmail.com>
Date: Tue, 6 Jan 2026 19:02:54 +0300
Subject: [PATCH 66/68] Adds openapi.json

---
 simplyblock_web/static/openapi.json | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 simplyblock_web/static/openapi.json

diff --git a/simplyblock_web/static/openapi.json b/simplyblock_web/static/openapi.json
new file mode 100644
index 000000000..3e2a05130
--- /dev/null
+++ b/simplyblock_web/static/openapi.json
@@ -0,0 +1 @@
+{"openapi":"3.1.0","info":{"title":"FastAPI","version":"0.1.0"},"paths":{"/api/v2/clusters/":{"get":{"summary":"Clusters:List","operationId":"clusters_list_api_v2_clusters__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/ClusterDTO"},"title":"Response Clusters List Api V2 Clusters  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Create","operationId":"clusters_create_api_v2_clusters__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClusterParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/":{"get":{"summary":"Clusters:Detail","operationId":"clusters_detail_api_v2_clusters__cluster_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClusterDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"put":{"summary":"Clusters:Update","operationId":"clusters_update_api_v2_clusters__cluster_id___put","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/UpdatableClusterParameters"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Delete","operationId":"clusters_delete_api_v2_clusters__cluster_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/capacity":{"get":{"summary":"Clusters:Capacity","operationId":"clusters_capacity_api_v2_clusters__cluster_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/iostats":{"get":{"summary":"Clusters:Iostats","operationId":"clusters_iostats_api_v2_clusters__cluster_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/logs":{"get":{"summary":"Clusters:Logs","operationId":"clusters_logs_api_v2_clusters__cluster_id__logs_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","default":50,"title":"Limit"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/start":{"post":{"summary":"Clusters:Start","operationId":"clusters_start_api_v2_clusters__cluster_id__start_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/shutdown":{"post":{"summary":"Clusters:Shutdown","operationId":"clusters_shutdown_api_v2_clusters__cluster_id__shutdown_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/activate":{"post":{"summary":"Clusters:Activate","operationId":"clusters_activate_api_v2_clusters__cluster_id__activate_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/update":{"post":{"summary":"Clusters:Upgrade","operationId":"clusters_upgrade_api_v2_clusters__cluster_id__update_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_UpdateParams"}}}},"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/":{"get":{"summary":"Clusters:Storage-Nodes:List","operationId":"clusters_storage_nodes_list_api_v2_clusters__cluster_id__storage_nodes__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/StorageNodeDTO"},"title":"Response Clusters Storage Nodes List Api V2 Clusters  Cluster Id  Storage Nodes  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Nodes:Create","operationId":"clusters_storage_nodes_create_api_v2_clusters__cluster_id__storage_nodes__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/StorageNodeParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/":{"get":{"summary":"Clusters:Storage-Nodes:Detail","operationId":"clusters_storage_nodes_detail_api_v2_clusters__cluster_id__storage_nodes__storage_node_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Nodes:Delete","operationId":"clusters_storage_nodes_delete_api_v2_clusters__cluster_id__storage_nodes__storage_node_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"force_remove","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force Remove"}},{"name":"force_migrate","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force Migrate"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/capacity":{"get":{"summary":"Clusters:Storage-Nodes:Capacity","operationId":"clusters_storage_nodes_capacity_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/iostats":{"get":{"summary":"Clusters:Storage-Nodes:Iostats","operationId":"clusters_storage_nodes_iostats_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/nics":{"get":{"summary":"Clusters:Storage-Nodes:Nics:List","operationId":"clusters_storage_nodes_nics_list_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__nics_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/nics/{nic_id}/iostats":{"get":{"summary":"Clusters:Storage-Nodes:Nics:Iostats","operationId":"clusters_storage_nodes_nics_iostats_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__nics__nic_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"nic_id","in":"path","required":true,"schema":{"type":"string","title":"Nic Id"}},{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/suspend":{"post":{"summary":"Clusters:Storage-Nodes:Suspend","operationId":"clusters_storage_nodes_suspend_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__suspend_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"force","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/resume":{"post":{"summary":"Clusters:Storage-Nodes:Resume","operationId":"clusters_storage_nodes_resume_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__resume_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/shutdown":{"post":{"summary":"Clusters:Storage-Nodes:Shutdown","operationId":"clusters_storage_nodes_shutdown_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__shutdown_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"force","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/restart":{"post":{"summary":"Clusters:Storage-Nodes:Restart","operationId":"clusters_storage_nodes_restart_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__restart_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_RestartParams","default":{"force":false,"reattach_volume":false}}}}},"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/start":{"post":{"summary":"Clusters:Storage-Nodes:Start","operationId":"clusters_storage_nodes_start_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__start_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_RestartParams","default":{"force":false,"reattach_volume":false}}}}},"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/":{"get":{"summary":"Clusters:Storage Nodes:Devices:List","operationId":"clusters_storage_nodes_devices_list_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/DeviceDTO"},"title":"Response Clusters Storage Nodes Devices List Api V2 Clusters  Cluster Id  Storage Nodes  Storage Node Id  Devices  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/":{"get":{"summary":"Clusters:Storage Nodes:Devices:Detail","operationId":"clusters_storage_nodes_devices_detail_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/DeviceDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage Nodes:Devices:Delete","operationId":"clusters_storage_nodes_devices_delete_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/capacity":{"get":{"summary":"Clusters:Storage Nodes:Devices:Capacity","operationId":"clusters_storage_nodes_devices_capacity_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/iostats":{"get":{"summary":"Clusters:Storage Nodes:Devices:Iostats","operationId":"clusters_storage_nodes_devices_iostats_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/reset":{"post":{"summary":"Clusters:Storage Nodes:Devices:Reset","operationId":"clusters_storage_nodes_devices_reset_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id__reset_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/":{"get":{"summary":"Clusters:Storage-Pools:List","operationId":"clusters_storage_pools_list_api_v2_clusters__cluster_id__storage_pools__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/StoragePoolDTO"},"title":"Response Clusters Storage Pools List Api V2 Clusters  Cluster Id  Storage Pools  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Pools:Create","operationId":"clusters_storage_pools_create_api_v2_clusters__cluster_id__storage_pools__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/StoragePoolParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/":{"get":{"summary":"Clusters:Storage-Pools:Detail","operationId":"clusters_storage_pools_detail_api_v2_clusters__cluster_id__storage_pools__pool_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/StoragePoolDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Pools:Delete","operationId":"clusters_storage_pools_delete_api_v2_clusters__cluster_id__storage_pools__pool_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"put":{"summary":"Clusters:Storage-Pools:Update","operationId":"clusters_storage_pools_update_api_v2_clusters__cluster_id__storage_pools__pool_id___put","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/UpdatableStoragePoolParams"}}}},"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/iostats":{"get":{"summary":"Clusters:Storage-Pools:Iostats","operationId":"clusters_storage_pools_iostats_api_v2_clusters__cluster_id__storage_pools__pool_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","default":20,"title":"Limit"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/":{"get":{"summary":"Clusters:Storage-Pools:Volumes:List","operationId":"clusters_storage_pools_volumes_list_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/VolumeDTO"},"title":"Response Clusters Storage Pools Volumes List Api V2 Clusters  Cluster Id  Storage Pools  Pool Id  Volumes  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Pools:Volumes:Create","operationId":"clusters_storage_pools_volumes_create_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/RootModel_Union__CreateParams___CloneParams__"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Detail","operationId":"clusters_storage_pools_volumes_detail_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VolumeDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"put":{"summary":"Clusters:Storage-Pools:Volumes:Update","operationId":"clusters_storage_pools_volumes_update_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id___put","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/UpdatableLVolParams"}}}},"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Pools:Volumes:Delete","operationId":"clusters_storage_pools_volumes_delete_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/inflate":{"post":{"summary":"Clusters:Storage-Pools:Volumes:Inflate","operationId":"clusters_storage_pools_volumes_inflate_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__inflate_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/connect":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Connect","operationId":"clusters_storage_pools_volumes_connect_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__connect_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/capacity":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Capacity","operationId":"clusters_storage_pools_volumes_capacity_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/iostats":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Iostats","operationId":"clusters_storage_pools_volumes_iostats_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/snapshots":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Snapshots:List","operationId":"clusters_storage_pools_volumes_snapshots_list_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__snapshots_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/SnapshotDTO"},"title":"Response Clusters Storage Pools Volumes Snapshots List Api V2 Clusters  Cluster Id  Storage Pools  Pool Id  Volumes  Volume Id  Snapshots Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Pools:Volumes:Snapshots:Create","operationId":"clusters_storage_pools_volumes_snapshots_create_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__snapshots_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_SnapshotParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/snapshots/":{"get":{"summary":"Clusters:Storage-Pools:Snapshots:List","operationId":"clusters_storage_pools_snapshots_list_api_v2_clusters__cluster_id__storage_pools__pool_id__snapshots__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/SnapshotDTO"},"title":"Response Clusters Storage Pools Snapshots List Api V2 Clusters  Cluster Id  Storage Pools  Pool Id  Snapshots  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/snapshots/{snapshot_id}/":{"get":{"summary":"Clusters:Storage-Pools:Snapshots:Detail","operationId":"clusters_storage_pools_snapshots_detail_api_v2_clusters__cluster_id__storage_pools__pool_id__snapshots__snapshot_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"snapshot_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Snapshot Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/SnapshotDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Pools:Snapshots:Delete","operationId":"clusters_storage_pools_snapshots_delete_api_v2_clusters__cluster_id__storage_pools__pool_id__snapshots__snapshot_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"snapshot_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Snapshot Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/management-nodes/":{"get":{"summary":"Management Nodes:List","operationId":"management_nodes_list_api_v2_management_nodes__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"query","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/ManagementNodeDTO"},"title":"Response Management Nodes List Api V2 Management Nodes  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/management-nodes/{management_node_id}/":{"get":{"summary":"Management Node:Detail","operationId":"management_node_detail_api_v2_management_nodes__management_node_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"management_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Management Node Id"}},{"name":"cluster_id","in":"query","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ManagementNodeDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"ClusterDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"nqn":{"type":"string","title":"Nqn"},"status":{"type":"string","enum":["active","read_only","inactive","suspended","degraded","unready","in_activation","in_expansion"],"title":"Status"},"rebalancing":{"type":"boolean","title":"Rebalancing"},"block_size":{"type":"integer","minimum":0.0,"title":"Block Size"},"coding":{"prefixItems":[{"type":"integer","minimum":0.0},{"type":"integer","minimum":0.0}],"type":"array","maxItems":2,"minItems":2,"title":"Coding"},"ha":{"type":"boolean","title":"Ha"},"utliziation_critical":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Utliziation Critical"},"utilization_warning":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Utilization Warning"},"provisioned_cacacity_critical":{"type":"integer","minimum":0.0,"title":"Provisioned Cacacity Critical"},"provisioned_cacacity_warning":{"type":"integer","minimum":0.0,"title":"Provisioned Cacacity Warning"},"node_affinity":{"type":"boolean","title":"Node Affinity"},"anti_affinity":{"type":"boolean","title":"Anti Affinity"},"secret":{"type":"string","title":"Secret"}},"type":"object","required":["id","name","nqn","status","rebalancing","block_size","coding","ha","utliziation_critical","utilization_warning","provisioned_cacacity_critical","provisioned_cacacity_warning","node_affinity","anti_affinity","secret"],"title":"ClusterDTO"},"ClusterParams":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"blk_size":{"type":"integer","enum":[512,4096],"title":"Blk Size","default":512},"page_size_in_blocks":{"type":"integer","exclusiveMinimum":0.0,"title":"Page Size In Blocks","default":2097152},"cap_warn":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Cap Warn","default":0},"cap_crit":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Cap Crit","default":0},"prov_cap_warn":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Prov Cap Warn","default":0},"prov_cap_crit":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Prov Cap Crit","default":0},"distr_ndcs":{"type":"integer","title":"Distr Ndcs","default":1},"distr_npcs":{"type":"integer","title":"Distr Npcs","default":1},"distr_bs":{"type":"integer","title":"Distr Bs","default":4096},"distr_chunk_bs":{"type":"integer","title":"Distr Chunk Bs","default":4096},"ha_type":{"type":"string","enum":["single","ha"],"title":"Ha Type","default":"single"},"qpair_count":{"type":"integer","title":"Qpair Count","default":256},"max_queue_size":{"type":"integer","title":"Max Queue Size","default":128},"inflight_io_threshold":{"type":"integer","title":"Inflight Io Threshold","default":4},"enable_node_affinity":{"type":"boolean","title":"Enable Node Affinity","default":false},"strict_node_anti_affinity":{"type":"boolean","title":"Strict Node Anti Affinity","default":false}},"type":"object","title":"ClusterParams"},"DeviceDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"status":{"type":"string","title":"Status"},"health_check":{"type":"boolean","title":"Health Check"},"size":{"type":"integer","title":"Size"},"io_error":{"type":"boolean","title":"Io Error"},"is_partition":{"type":"boolean","title":"Is Partition"},"nvmf_ips":{"items":{"type":"string","format":"ipv4"},"type":"array","title":"Nvmf Ips"},"nvmf_nqn":{"type":"string","title":"Nvmf Nqn","default":""},"nvmf_port":{"type":"integer","title":"Nvmf Port","default":0}},"type":"object","required":["id","status","health_check","size","io_error","is_partition","nvmf_ips"],"title":"DeviceDTO"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ManagementNodeDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"status":{"type":"string","title":"Status"},"hostname":{"type":"string","title":"Hostname"},"ip":{"type":"string","format":"ipv4","title":"Ip"}},"type":"object","required":["id","status","hostname","ip"],"title":"ManagementNodeDTO"},"RootModel_Union__CreateParams___CloneParams__":{"anyOf":[{"$ref":"#/components/schemas/_CreateParams"},{"$ref":"#/components/schemas/_CloneParams"}],"title":"RootModel[Union[_CreateParams, _CloneParams]]"},"SnapshotDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"type":"string","title":"Name"},"status":{"type":"string","title":"Status"},"health_check":{"type":"boolean","title":"Health Check"},"size":{"type":"integer","minimum":0.0,"title":"Size"},"used_size":{"type":"integer","minimum":0.0,"title":"Used Size"},"lvol":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Lvol"}},"type":"object","required":["id","name","status","health_check","size","used_size","lvol"],"title":"SnapshotDTO"},"StorageNodeDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"status":{"type":"string","title":"Status"},"ip":{"type":"string","format":"ipv4","title":"Ip"}},"type":"object","required":["id","status","ip"],"title":"StorageNodeDTO"},"StorageNodeParams":{"properties":{"node_address":{"type":"string","title":"Node Address","default":"^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}$"},"interface_name":{"type":"string","title":"Interface Name"},"max_snapshots":{"type":"integer","title":"Max Snapshots","default":500},"ha_jm":{"type":"boolean","title":"Ha Jm","default":true},"test_device":{"type":"boolean","title":"Test Device","default":false},"spdk_image":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Spdk Image"},"spdk_debug":{"type":"boolean","title":"Spdk Debug","default":false},"full_page_unmap":{"type":"boolean","title":"Full Page Unmap","default":false},"data_nics":{"items":{"type":"string"},"type":"array","title":"Data Nics","default":[]},"namespace":{"type":"string","title":"Namespace","default":"default"},"jm_percent":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Jm Percent","default":3},"partitions":{"type":"integer","title":"Partitions","default":1},"iobuf_small_pool_count":{"type":"integer","title":"Iobuf Small Pool Count","default":0},"iobuf_large_pool_count":{"type":"integer","title":"Iobuf Large Pool Count","default":0},"ha_jm_count":{"type":"integer","title":"Ha Jm Count","default":3}},"type":"object","required":["interface_name","spdk_image"],"title":"StorageNodeParams"},"StoragePoolDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"type":"string","title":"Name"},"status":{"type":"string","enum":["active","inactive"],"title":"Status"},"max_size":{"type":"integer","minimum":0.0,"title":"Max Size"},"volume_max_size":{"type":"integer","minimum":0.0,"title":"Volume Max Size"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops"},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes"},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes"},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes"}},"type":"object","required":["id","name","status","max_size","volume_max_size","max_rw_iops","max_rw_mbytes","max_r_mbytes","max_w_mbytes"],"title":"StoragePoolDTO"},"StoragePoolParams":{"properties":{"name":{"type":"string","title":"Name"},"pool_max":{"type":"integer","minimum":0.0,"title":"Pool Max","default":0},"volume_max_size":{"type":"integer","minimum":0.0,"title":"Volume Max Size","default":0},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops","default":0},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes","default":0},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes","default":0},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes","default":0}},"type":"object","required":["name"],"title":"StoragePoolParams"},"UpdatableClusterParameters":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"}},"type":"object","title":"UpdatableClusterParameters"},"UpdatableLVolParams":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops","default":0},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes","default":0},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes","default":0},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes","default":0},"size":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Size"}},"type":"object","title":"UpdatableLVolParams"},"UpdatableStoragePoolParams":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"max_size":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max Size"},"volume_max_size":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Volume Max Size"},"max_rw_iops":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max Rw Iops"},"max_rw_mbytes":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max Rw Mbytes"},"max_r_mbytes":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max R Mbytes"},"max_w_mbytes":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max W Mbytes"}},"type":"object","title":"UpdatableStoragePoolParams"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"VolumeDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"type":"string","title":"Name"},"status":{"type":"string","title":"Status"},"health_check":{"type":"boolean","title":"Health Check"},"nqn":{"type":"string","title":"Nqn"},"nodes":{"items":{"type":"string"},"type":"array","title":"Nodes"},"port":{"type":"integer","exclusiveMaximum":65536.0,"minimum":0.0,"title":"Port"},"size":{"type":"integer","minimum":0.0,"title":"Size"},"cloned_from":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Cloned From"},"crypto_key":{"anyOf":[{"prefixItems":[{"type":"string"},{"type":"string"}],"type":"array","maxItems":2,"minItems":2},{"type":"null"}],"title":"Crypto Key"},"high_availability":{"type":"boolean","title":"High Availability"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops"},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes"},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes"},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes"}},"type":"object","required":["id","name","status","health_check","nqn","nodes","port","size","cloned_from","crypto_key","high_availability","max_rw_iops","max_rw_mbytes","max_r_mbytes","max_w_mbytes"],"title":"VolumeDTO"},"_CloneParams":{"properties":{"name":{"type":"string","title":"Name"},"snapshot_id":{"anyOf":[{"type":"string","pattern":"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"},{"type":"null"}],"title":"Snapshot Id"},"size":{"type":"integer","minimum":0.0,"title":"Size","default":0}},"type":"object","required":["name","snapshot_id"],"title":"_CloneParams"},"_CreateParams":{"properties":{"name":{"type":"string","title":"Name"},"size":{"type":"integer","minimum":0.0,"title":"Size"},"crypto_key":{"anyOf":[{"prefixItems":[{"type":"string"},{"type":"string"}],"type":"array","maxItems":2,"minItems":2},{"type":"null"}],"title":"Crypto Key"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops","default":0},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes","default":0},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes","default":0},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes","default":0},"ha_type":{"anyOf":[{"type":"string","enum":["single","ha"]},{"type":"null"}],"title":"Ha Type"},"host_id":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Host Id"},"priority_class":{"type":"integer","enum":[0,1],"title":"Priority Class","default":0},"namespace":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Namespace"},"pvc_name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Pvc Name"},"ndcs":{"type":"integer","minimum":0.0,"title":"Ndcs","default":0},"npcs":{"type":"integer","minimum":0.0,"title":"Npcs","default":0}},"type":"object","required":["name","size"],"title":"_CreateParams"},"_RestartParams":{"properties":{"force":{"type":"boolean","title":"Force","default":false},"reattach_volume":{"type":"boolean","title":"Reattach Volume","default":false}},"type":"object","title":"_RestartParams"},"_SnapshotParams":{"properties":{"name":{"type":"string","title":"Name"}},"type":"object","required":["name"],"title":"_SnapshotParams"},"_UpdateParams":{"properties":{"management_image":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Management Image"},"spdk_image":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Spdk Image"},"restart":{"type":"boolean","title":"Restart","default":false}},"type":"object","required":["management_image","spdk_image"],"title":"_UpdateParams"}},"securitySchemes":{"HTTPBearer":{"type":"http","scheme":"bearer"}}}}
\ No newline at end of file

From 741f44f0be79aac6fd510a443c4c7e0284ff0cee Mon Sep 17 00:00:00 2001
From: Alexander Sheredin <alexander@simplyblock.io>
Date: Thu, 8 Jan 2026 20:10:46 +0300
Subject: [PATCH 67/68] merge from main

---
 LICENSE                                       |   21 +
 docker/Dockerfile                             |   19 +-
 docker/Dockerfile_base                        |    1 -
 docs/talos.md                                 |   14 -
 e2e/__init__.py                               |    8 +-
 e2e/continuous_log_collector.py               |    5 +-
 e2e/e2e_tests/cluster_test_base.py            |   10 +-
 e2e/e2e_tests/single_node_multi_fio_perf.py   |    9 +-
 .../continuous_failover_ha_multi_client.py    |  131 +-
 ...s_failover_ha_multi_client_quick_outage.py |  534 +++++++
 .../continuous_failover_ha_multi_outage.py    |  398 ++++-
 e2e/utils/ssh_utils.py                        | 1390 ++++++++++++-----
 requirements.txt                              |    1 +
 simplyblock_core/cluster_ops.py               |  144 +-
 simplyblock_core/constants.py                 |    5 +-
 .../controllers/cluster_events.py             |   10 +
 .../controllers/device_controller.py          |   18 +-
 .../controllers/health_controller.py          |    4 +-
 .../controllers/lvol_controller.py            |   27 +-
 .../controllers/pool_controller.py            |   20 +-
 .../controllers/storage_events.py             |   24 +
 .../controllers/tasks_controller.py           |   23 +
 simplyblock_core/env_var                      |    2 +-
 simplyblock_core/mgmt_node_ops.py             |   10 +-
 simplyblock_core/models/job_schedule.py       |    1 +
 simplyblock_core/models/storage_node.py       |   23 +-
 simplyblock_core/prom_client.py               |  130 ++
 simplyblock_core/rpc_client.py                |    4 +-
 simplyblock_core/scripts/charts/Chart.yaml    |    5 -
 .../scripts/charts/templates/app_k8s.yaml     |  129 +-
 .../templates/csi-hostpath-driverinfo.yaml    |   24 +
 .../charts/templates/csi-hostpath-plugin.yaml |  233 +++
 .../charts/templates/foundationdb.yaml        |   55 +-
 .../scripts/charts/templates/mongodb.yaml     |    4 +-
 .../charts/templates/monitoring_k8s.yaml      |    2 +
 .../charts/templates/storage_class.yaml       |   19 +-
 .../scripts/charts/values-template.yaml       |  194 ---
 simplyblock_core/scripts/charts/values.yaml   |   13 +-
 .../scripts/docker-compose-swarm.yml          |   14 +
 simplyblock_core/scripts/install_deps.sh      |    6 +-
 .../services/capacity_and_stats_collector.py  |   15 +
 simplyblock_core/services/lvol_monitor.py     |   18 +-
 .../services/lvol_stat_collector.py           |   11 +
 simplyblock_core/services/snapshot_monitor.py |    5 +-
 .../services/storage_node_monitor.py          |   14 +-
 .../services/tasks_runner_failed_migration.py |    2 +-
 .../services/tasks_runner_jc_comp.py          |    2 +
 .../services/tasks_runner_migration.py        |    2 +-
 .../tasks_runner_new_dev_migration.py         |    2 +-
 .../services/tasks_runner_port_allow.py       |   30 +-
 .../services/tasks_runner_sync_lvol_del.py    |   77 +
 simplyblock_core/snode_client.py              |   14 +-
 simplyblock_core/storage_node_ops.py          |  235 +--
 simplyblock_core/utils/__init__.py            |  154 +-
 .../api/internal/storage_node/docker.py       |    1 +
 .../api/internal/storage_node/kubernetes.py   |   55 +-
 simplyblock_web/api/v1/__init__.py            |   31 +
 simplyblock_web/api/v1/cluster.py             |   48 +
 simplyblock_web/api/v1/pool.py                |   35 +-
 simplyblock_web/api/v2/__init__.py            |    3 +-
 simplyblock_web/api/v2/cluster.py             |   31 +-
 simplyblock_web/api/v2/device.py              |   18 +-
 simplyblock_web/api/v2/dtos.py                |   54 +-
 simplyblock_web/api/v2/pool.py                |   30 +-
 simplyblock_web/api/v2/storage_node.py        |   29 +-
 simplyblock_web/api/v2/task.py                |    2 +
 simplyblock_web/api/v2/volume.py              |   19 +-
 simplyblock_web/auth_middleware.py            |    4 +
 simplyblock_web/node_utils_k8s.py             |   19 +-
 simplyblock_web/static/openapi.json           |    1 +
 .../oc_storage_core_isolation.yaml.j2         |   15 +-
 .../templates/storage_deploy_spdk.yaml.j2     |   12 +-
 simplyblock_web/utils.py                      |    1 +
 73 files changed, 3511 insertions(+), 1162 deletions(-)
 create mode 100644 LICENSE
 create mode 100644 e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py
 create mode 100644 simplyblock_core/prom_client.py
 create mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml
 create mode 100644 simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml
 delete mode 100644 simplyblock_core/scripts/charts/values-template.yaml
 create mode 100644 simplyblock_core/services/tasks_runner_sync_lvol_del.py
 create mode 100644 simplyblock_web/static/openapi.json

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 000000000..37d1834ca
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023-2025 simplyblock GmbH
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/docker/Dockerfile b/docker/Dockerfile
index ce1a83ae1..c8999b47d 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,12 +1,29 @@
 # syntax=docker/dockerfile:1
 FROM simplyblock/simplyblock:base_image
 
+LABEL name="simplyblock"
+LABEL vendor="Simplyblock"
+LABEL version="1.0.0"
+LABEL release="1"
+LABEL summary="Simplyblock controlplane plane component"
+LABEL description="Simplyblock controlplane plane container"
+LABEL maintainer="developers@simplyblock.io"
+
+COPY LICENSE /licenses/LICENSE
+
 WORKDIR /app
 
 COPY requirements.txt .
 
-RUN pip3 install -r requirements.txt
+RUN pip3 install --no-cache-dir -r requirements.txt
+
 
 COPY . /app
 
 RUN python setup.py install
+
+RUN if [ -d /usr/share/terminfo ]; then \
+       find /usr/share/terminfo -lname '*ncr260vt300wpp*' -exec rm -f {} + ; \
+       rm -f /usr/share/terminfo/n/ncr260vt300wpp || true ; \
+    fi
+
diff --git a/docker/Dockerfile_base b/docker/Dockerfile_base
index 201d92759..735d331b1 100644
--- a/docker/Dockerfile_base
+++ b/docker/Dockerfile_base
@@ -39,4 +39,3 @@ COPY requirements.txt requirements.txt
 
 RUN pip3 install -r requirements.txt
 
-RUN rm -rf /usr/share/terminfo/
diff --git a/docs/talos.md b/docs/talos.md
index 47ff817d5..f1406ef38 100644
--- a/docs/talos.md
+++ b/docs/talos.md
@@ -19,26 +19,12 @@ kubectl label namespace simplyblock \
   --overwrite
 ```
 
-
-Patch the host machine so that OpenEBS could work
-
 Create a machine config patch with the contents below and save as patch.yaml
 ```
 cat > patch.yaml <<'EOF'
 machine:
   sysctls:
     vm.nr_hugepages: "1024"
-  nodeLabels:
-    openebs.io/engine: mayastor
-  kubelet:
-    extraMounts:
-      - destination: /var/openebs/local
-        type: bind
-        source: /var/openebs/local
-        options:
-          - rbind
-          - rshared
-          - rw
 EOF
 
 talosctl -e <endpoint ip/hostname> -n <node ip/hostname> patch mc -p @patch.yaml
diff --git a/e2e/__init__.py b/e2e/__init__.py
index e8cae33f7..31164238e 100644
--- a/e2e/__init__.py
+++ b/e2e/__init__.py
@@ -55,6 +55,7 @@
 from stress_test.continuous_failover_ha_geomtery import RandomMultiGeometryFailoverTest
 from stress_test.continuous_failover_ha_2node import RandomMultiClient2NodeFailoverTest
 from stress_test.continuous_failover_ha_rdma import RandomRDMAFailoverTest
+from stress_test.continuous_failover_ha_multi_client_quick_outage import RandomRapidFailoverNoGap
 
 
 from e2e_tests.upgrade_tests.major_upgrade import TestMajorUpgrade
@@ -96,8 +97,8 @@ def get_all_tests(custom=True, ha_test=False):
         TestLvolFioNpcs0,
         TestLvolFioNpcs1,
         TestLvolFioNpcs2,
-        TestLvolFioQOSBW,
-        TestLvolFioQOSIOPS,
+        # TestLvolFioQOSBW,
+        # TestLvolFioQOSIOPS,
         TestSingleNodeOutage,
         # TestSingleNodeReboot,
         # TestHASingleNodeReboot,
@@ -147,6 +148,7 @@ def get_stress_tests():
         RandomMultiGeometryFailoverTest,
         RandomMultiClient2NodeFailoverTest,
         RandomRDMAFailoverTest,
+        RandomRapidFailoverNoGap,
     ]
     return tests
 
@@ -161,4 +163,4 @@ def get_load_tests():
     tests = [
         TestLvolOutageLoadTest
     ]
-    return tests
\ No newline at end of file
+    return tests
diff --git a/e2e/continuous_log_collector.py b/e2e/continuous_log_collector.py
index 48f06fd80..d1ea68c38 100644
--- a/e2e/continuous_log_collector.py
+++ b/e2e/continuous_log_collector.py
@@ -1,6 +1,5 @@
 import os
 from datetime import datetime
-from pathlib import Path
 from utils.ssh_utils import SshUtils, RunnerK8sLog
 from logger_config import setup_logger
 
@@ -22,7 +21,7 @@ def __init__(self,docker_logs_path=None):
 
     def get_log_directory(self):
         timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
-        return os.path.join(Path.home(), "container-logs", f"manual-logs-{timestamp}")
+        return os.path.join('/mnt/nfs_share/', f"snapshot-repliction-from-replicated-clone-{timestamp}")
 
     def collect_logs(self, test_name):
         all_nodes = set()
@@ -75,4 +74,4 @@ def collect_logs(self, test_name):
 
 if __name__ == "__main__":
     collector = ContinuousLogCollector()
-    collector.collect_logs(test_name="Manual")
+    collector.collect_logs(test_name="snapshot-repliction-from-replicated-clone")
diff --git a/e2e/e2e_tests/cluster_test_base.py b/e2e/e2e_tests/cluster_test_base.py
index 5077544b0..d37222c88 100644
--- a/e2e/e2e_tests/cluster_test_base.py
+++ b/e2e/e2e_tests/cluster_test_base.py
@@ -401,13 +401,17 @@ def collect_management_details(self, post_teardown=False):
             cmd = f"{self.base_cmd} sn check {result['uuid']} >& {base_path}/node{node}_check{suffix}.txt"
             self.ssh_obj.exec_command(self.mgmt_nodes[0], cmd)
 
+            cmd = f"{self.base_cmd} sn get {result['uuid']} >& {base_path}/node{node}_get{suffix}.txt"
+            self.ssh_obj.exec_command(self.mgmt_nodes[0], cmd)
+
             node+=1
-        for node in self.fio_node:
+        all_nodes = self.storage_nodes + self.mgmt_nodes + self.client_machines
+        for node in all_nodes:
             base_path = os.path.join(self.docker_logs_path, node)
-            cmd = f"journalctl -k >& {base_path}/jounalctl_{node}.txt"
+            cmd = f"journalctl -k --no-tail >& {base_path}/jounalctl_{node}-final.txt"
 
             self.ssh_obj.exec_command(node, cmd)
-            cmd = f"dmesg -T >& {base_path}/dmesg_{node}.txt"
+            cmd = f"dmesg -T >& {base_path}/dmesg_{node}-final.txt"
             self.ssh_obj.exec_command(node, cmd)
             
     def teardown(self, delete_lvols=True, close_ssh=True):
diff --git a/e2e/e2e_tests/single_node_multi_fio_perf.py b/e2e/e2e_tests/single_node_multi_fio_perf.py
index 86a75c4d5..681cc1742 100644
--- a/e2e/e2e_tests/single_node_multi_fio_perf.py
+++ b/e2e/e2e_tests/single_node_multi_fio_perf.py
@@ -187,10 +187,11 @@ def cleanup_lvols(self, lvol_configs):
         self.logger.info("Starting cleanup of LVOLs")
         for config in lvol_configs:
             lvol_name = config['lvol_name']
-            self.ssh_obj.unmount_path(node=self.client_machines[0],
-                                      device=self.lvol_devices[lvol_name]['MountPath'])
-            self.ssh_obj.remove_dir(node=self.client_machines[0], 
-                                    dir_path=self.lvol_devices[lvol_name]['MountPath'])
+            if config['mount']:
+                self.ssh_obj.unmount_path(node=self.client_machines[0],
+                                          device=self.lvol_devices[lvol_name]['MountPath'])
+                self.ssh_obj.remove_dir(node=self.client_machines[0], 
+                                        dir_path=self.lvol_devices[lvol_name]['MountPath'])
             lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=lvol_name)
             subsystems = self.ssh_obj.get_nvme_subsystems(node=self.client_machines[0], 
                                                           nqn_filter=lvol_id)
diff --git a/e2e/stress_test/continuous_failover_ha_multi_client.py b/e2e/stress_test/continuous_failover_ha_multi_client.py
index a2869482d..0f0c9f94e 100644
--- a/e2e/stress_test/continuous_failover_ha_multi_client.py
+++ b/e2e/stress_test/continuous_failover_ha_multi_client.py
@@ -42,6 +42,7 @@ def __init__(self, **kwargs):
         self.sn_nodes = []
         self.current_outage_node = None
         self.snapshot_names = []
+        self.current_outage_nodes = []
         self.disconnect_thread = None
         self.outage_start_time = None
         self.outage_end_time = None
@@ -60,8 +61,7 @@ def __init__(self, **kwargs):
         # self.outage_types = ["graceful_shutdown", "container_stop", "interface_full_network_interrupt",
         #                      "interface_partial_network_interrupt",
         #                      "partial_nw"]
-        self.outage_types = ["graceful_shutdown", "container_stop", "interface_full_network_interrupt",
-                             "interface_partial_network_interrupt"]
+        self.outage_types = ["graceful_shutdown", "container_stop", "interface_full_network_interrupt"]
         # self.outage_types = ["partial_nw"]
         self.blocked_ports = None
         self.outage_log_file = os.path.join("logs", f"outage_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
@@ -111,7 +111,26 @@ def create_lvols_with_fio(self, count):
                 lvol_name = f"{self.lvol_name}_{i}" if not is_crypto else f"c{self.lvol_name}_{i}"
             self.logger.info(f"Creating lvol with Name: {lvol_name}, fs type: {fs_type}, crypto: {is_crypto}")
             try:
-                if self.current_outage_node:
+                self.logger.info(f"Current Outage Node: {self.current_outage_nodes}")
+                if self.current_outage_nodes:
+                    self.logger.info(f"Primary vs secondary: {self.sn_primary_secondary_map}")
+                    skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] in self.current_outage_nodes]
+                    self.logger.info(f"Skip Nodes: {skip_nodes}")
+                    for node in self.current_outage_nodes:
+                        skip_nodes.append(node)
+                    self.logger.info(f"Skip Nodes: {skip_nodes}")
+                    self.logger.info(f"Storage Nodes with sec: {self.sn_nodes_with_sec}")
+                    host_id = [node for node in self.sn_nodes_with_sec if node not in skip_nodes]
+                    self.sbcli_utils.add_lvol(
+                        lvol_name=lvol_name,
+                        pool_name=self.pool_name,
+                        size=self.lvol_size,
+                        crypto=is_crypto,
+                        key1=self.lvol_crypt_keys[0],
+                        key2=self.lvol_crypt_keys[1],
+                        host_id=host_id[0]
+                    )
+                elif self.current_outage_node:
                     skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] == self.current_outage_node]
                     skip_nodes.append(self.current_outage_node)
                     skip_nodes.append(self.sn_primary_secondary_map[self.current_outage_node])
@@ -276,7 +295,7 @@ def create_lvols_with_fio(self, count):
                     "iodepth": 1,
                     "numjobs": 5,
                     "time_based": True,
-                    "runtime": 2000,
+                    "runtime": 3000,
                     "log_avg_msec": 1000,
                     "iolog_file": self.lvol_mount_details[lvol_name]["iolog_base_path"],
                 },
@@ -306,11 +325,11 @@ def perform_random_outage(self):
         node_ip = node_details[0]["mgmt_ip"]
         node_rpc_port = node_details[0]["rpc_port"]
 
-        sleep_n_sec(120)
+        sleep_n_sec(5)
         for node in self.sn_nodes_with_sec:
-            self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
-                                      storage_node_id=node)
-        
+            # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
+            #                          storage_node_id=node)
+            self.logger.info("Skipping lvstore dump!!")
         for node in self.sn_nodes_with_sec:
             cur_node_details = self.sbcli_utils.get_storage_node_details(node)
             cur_node_ip = cur_node_details[0]["mgmt_ip"]
@@ -417,7 +436,7 @@ def perform_random_outage(self):
             
             self.disconnect_thread = threading.Thread(
                 target=self.ssh_obj.disconnect_all_active_interfaces,
-                args=(node_ip, active_interfaces, 600),
+                args=(node_ip, active_interfaces, 300),
             )
             self.disconnect_thread.start()
         elif outage_type == "interface_partial_network_interrupt":
@@ -430,7 +449,7 @@ def perform_random_outage(self):
             
             self.disconnect_thread = threading.Thread(
                 target=self.ssh_obj.disconnect_all_active_interfaces,
-                args=(node_ip, active_interfaces, 600),
+                args=(node_ip, active_interfaces, 300),
             )
             self.disconnect_thread.start()
         elif outage_type == "partial_nw":
@@ -478,12 +497,12 @@ def perform_random_outage(self):
                 self.ssh_obj.disconnect_lvol_node_device(node=self.lvol_mount_details[lvol]["Client"], device=self.lvol_mount_details[lvol]["Device"])
             
         if outage_type != "partial_nw" or outage_type != "partial_nw_single_port":
-            sleep_n_sec(120)
+            sleep_n_sec(10)
         
         return outage_type
     
     
-    def restart_nodes_after_failover(self, outage_type):
+    def restart_nodes_after_failover(self, outage_type, restart=False):
         """Perform steps for node restart."""
         node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node)
         node_ip = node_details[0]["mgmt_ip"]
@@ -543,14 +562,48 @@ def restart_nodes_after_failover(self, outage_type):
                 self.ssh_obj.exec_command(node=self.lvol_mount_details[lvol]["Client"], command=connect)
         
         elif outage_type == "container_stop":
-            self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000)
-            # Log the restart event
-            self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=1)
+            if restart:
+                max_retries = 10
+                retry_delay = 10  # seconds
+
+                # Retry mechanism for restarting the node
+                for attempt in range(max_retries):
+                    try:
+                        force=False
+                        if attempt == max_retries - 1:
+                            force=True
+                            self.logger.info("[CHECK] Restarting Node via CLI with Force flag as via API Fails.")
+                        else:
+                            self.logger.info("[CHECK] Restarting Node via CLI as via API Fails.")
+                        self.ssh_obj.restart_node(node=self.mgmt_nodes[0],
+                                                node_id=self.current_outage_node,
+                                                force=force)
+                        # else:
+                        #     self.sbcli_utils.restart_node(node_uuid=self.current_outage_node, expected_error_code=[503])
+                        self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000)
+                        break  # Exit loop if successful
+                    except Exception as _:
+                        if attempt < max_retries - 2:
+                            self.logger.info(f"Attempt {attempt + 1} failed to restart node. Retrying in {retry_delay} seconds...")
+                            sleep_n_sec(retry_delay)
+                        elif attempt < max_retries - 1:
+                            self.logger.info(f"Attempt {attempt + 1} failed to restart node via API. Retrying in {retry_delay} seconds via CMD...")
+                            sleep_n_sec(retry_delay)
+                        else:
+                            self.logger.info("Max retries reached. Failed to restart node.")
+                            raise  # Rethrow the last exception
+                self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000)
+                # Log the restart event
+                self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=0)
+            else:
+                self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000)
+                # Log the restart event
+                self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=2)
 
         elif "network_interrupt" in outage_type:
             self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=1000)
             # Log the restart event
-            self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=11)
+            self.log_outage_event(self.current_outage_node, outage_type, "Node restarted", outage_time=6)
         
         if not self.k8s_test:
             for node in self.storage_nodes:
@@ -608,9 +661,9 @@ def restart_nodes_after_failover(self, outage_type):
             # sleep_n_sec(30)
 
         for node in self.sn_nodes_with_sec:
-            self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
-                                      storage_node_id=node)
-
+            # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
+            #                          storage_node_id=node)
+            self.logger.info("Skipping lvstore dump!!")
 
     def create_snapshots_and_clones(self):
         """Create snapshots and clones during an outage."""
@@ -777,7 +830,7 @@ def create_snapshots_and_clones(self):
                     "iodepth": 1,
                     "numjobs": 5,
                     "time_based": True,
-                    "runtime": 2000,
+                    "runtime": 3000,
                     "log_avg_msec": 1000,
                     "iolog_file": self.clone_mount_details[clone_name]["iolog_base_path"],
                 },
@@ -786,22 +839,23 @@ def create_snapshots_and_clones(self):
             self.fio_threads.append(fio_thread)
             self.logger.info(f"Created snapshot {snapshot_name} and clone {clone_name}.")
 
-            self.sbcli_utils.resize_lvol(lvol_id=self.lvol_mount_details[lvol]["ID"],
-                                         new_size=f"{self.int_lvol_size}G")
+            if self.lvol_mount_details[lvol]["ID"]:
+                self.sbcli_utils.resize_lvol(lvol_id=self.lvol_mount_details[lvol]["ID"],
+                                             new_size=f"{self.int_lvol_size}G")
             sleep_n_sec(10)
-            self.sbcli_utils.resize_lvol(lvol_id=self.clone_mount_details[clone_name]["ID"],
-                                         new_size=f"{self.int_lvol_size}G")
-            
+            if self.clone_mount_details[clone_name]["ID"]:
+                self.sbcli_utils.resize_lvol(lvol_id=self.clone_mount_details[clone_name]["ID"],
+                                             new_size=f"{self.int_lvol_size}G")
+
 
     def delete_random_lvols(self, count):
         """Delete random lvols during an outage."""
         skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] == self.current_outage_node]
         skip_nodes.append(self.current_outage_node)
         skip_nodes.append(self.sn_primary_secondary_map[self.current_outage_node])
-        skip_nodes_lvol = []
-        self.logger.info(f"Skipping Nodes: {skip_nodes_lvol}")
+        self.logger.info(f"Skipping Nodes: {skip_nodes}")
         available_lvols = [
-            lvol for node, lvols in self.node_vs_lvol.items() if node not in skip_nodes_lvol for lvol in lvols
+            lvol for node, lvols in self.node_vs_lvol.items() if node not in skip_nodes for lvol in lvols
         ]
         self.logger.info(f"Available Lvols: {available_lvols}")
         if len(available_lvols) < count:
@@ -922,7 +976,7 @@ def perform_failover_during_outage(self):
                     storage_node_id=node,
                     logs_path=self.docker_logs_path
                 )
-            self.create_lvols_with_fio(3)
+            self.create_lvols_with_fio(5)
             if not self.k8s_test:
                 for node in self.storage_nodes:
                     self.ssh_obj.restart_docker_logging(
@@ -1041,7 +1095,7 @@ def restart_fio(self, iteration):
                     "iodepth": 1,
                     "numjobs": 5,
                     "time_based": True,
-                    "runtime": 2000,
+                    "runtime": 3000,
                     "log_avg_msec": 1000,
                     "iolog_file": self.lvol_mount_details[lvol]["iolog_base_path"],
                 },
@@ -1150,7 +1204,7 @@ def run(self):
                         storage_node_id=node,
                         logs_path=self.docker_logs_path
                     )
-                self.create_lvols_with_fio(5)
+                self.create_lvols_with_fio(3)
                 if not self.k8s_test:
                     for node in self.storage_nodes:
                         self.ssh_obj.restart_docker_logging(
@@ -1175,7 +1229,7 @@ def run(self):
             else:
                 self.logger.info(f"Current outage node: {self.current_outage_node} is secondary node. Skipping delete and create")
             if outage_type != "partial_nw" or outage_type != "partial_nw_single_port":
-                sleep_n_sec(280)
+                sleep_n_sec(100)
             for node in self.sn_nodes_with_sec:
                 cur_node_details = self.sbcli_utils.get_storage_node_details(node)
                 cur_node_ip = cur_node_details[0]["mgmt_ip"]
@@ -1195,7 +1249,7 @@ def run(self):
                 )
             self.logger.info("Waiting for fallback.")
             if outage_type != "partial_nw" or outage_type != "partial_nw_single_port":
-                sleep_n_sec(100)
+                sleep_n_sec(15)
             time_duration = self.common_utils.calculate_time_duration(
                 start_timestamp=self.outage_start_time,
                 end_timestamp=self.outage_end_time
@@ -1213,23 +1267,24 @@ def run(self):
             no_task_ok = outage_type in {"partial_nw", "partial_nw_single_port", "lvol_disconnect_primary"}
             if not self.sbcli_utils.is_secondary_node(self.current_outage_node):
                 self.validate_migration_for_node(self.outage_start_time, 2000, None, 60, no_task_ok=no_task_ok)
+                # pass
 
             for clone, clone_details in self.clone_mount_details.items():
                 self.common_utils.validate_fio_test(clone_details["Client"],
                                                     log_file=clone_details["Log"])
-                # self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone}_fio*"])
-                # self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone}_fio_iolog*"])
+                self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone}_fio*"])
+                self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone}_fio_iolog*"])
             
             for lvol, lvol_details in self.lvol_mount_details.items():
                 self.common_utils.validate_fio_test(lvol_details["Client"],
                                                     log_file=lvol_details["Log"])
-                # self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/local-{lvol}_fio*"])
-                # self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"])
+                self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/local-{lvol}_fio*"])
+                self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"])
 
             # Perform failover and manage resources during outage
             outage_type = self.perform_failover_during_outage()
             if outage_type != "partial_nw" or outage_type != "partial_nw_single_port":
-                sleep_n_sec(100)
+                sleep_n_sec(15)
             time_duration = self.common_utils.calculate_time_duration(
                 start_timestamp=self.outage_start_time,
                 end_timestamp=self.outage_end_time
diff --git a/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py
new file mode 100644
index 000000000..c2c1051a2
--- /dev/null
+++ b/e2e/stress_test/continuous_failover_ha_multi_client_quick_outage.py
@@ -0,0 +1,534 @@
+# stress_test/continuous_failover_ha_multi_client_quick_outage.py
+# Fast outages with long-running FIO, no churn beyond initial setup.
+# - Create lvols, snapshots, clones ONCE at the beginning
+# - Start 30min FIO on all mounts (lvols + clones)
+# - Run fast outages (as soon as node is ONLINE again)
+# - Every 5 outages: wait for all FIO to complete, validate, then (optionally) wait for migration window
+# - Graceful shutdown: suspend -> wait SUSPENDED -> shutdown -> wait OFFLINE -> keep offline 5 min -> restart
+# - After any restart: 15–30s idle then immediately next outage
+
+import os
+import random
+import string
+import threading
+from datetime import datetime
+from utils.common_utils import sleep_n_sec
+from exceptions.custom_exception import LvolNotConnectException
+from stress_test.lvol_ha_stress_fio import TestLvolHACluster
+
+
+def _rand_id(n=15, first_alpha=True):
+    letters = string.ascii_uppercase
+    digits = string.digits
+    allc = letters + digits
+    if first_alpha:
+        return random.choice(letters) + ''.join(random.choices(allc, k=n-1))
+    return ''.join(random.choices(allc, k=n))
+
+
+class RandomRapidFailoverNoGap(TestLvolHACluster):
+    """
+    - Minimal churn (only bootstrap creates)
+    - Long FIO (30 mins) on every lvol/clone
+    - Outage pacing: next outage right after ONLINE; add 15–30s buffer post-restart
+    - Validate FIO and pause for migration every 5 outages
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        # Base knobs
+        self.total_lvols = 20
+        self.lvol_size = "40G"
+        self.fio_size = "15G"
+
+        # Validation cadence & FIO runtime
+        self.validate_every = 5
+        self._iter = 0
+        self._per_wave_fio_runtime = 3600      # 60 minutes
+        self._fio_wait_timeout = 5000          # wait for all to finish
+
+        # Internal state
+        self.fio_threads = []
+        self.lvol_mount_details = {}
+        self.clone_mount_details = {}
+        self.sn_nodes = []
+        self.sn_nodes_with_sec = []
+        self.sn_primary_secondary_map = {}
+        self.node_vs_lvol = {}
+        self.snapshot_names = []
+        self.snap_vs_node = {}
+        self.current_outage_node = None
+        self.outage_start_time = None
+        self.outage_end_time = None
+        self.first_outage_ts = None            # track the first outage for migration window
+        self.test_name = "longfio_nochurn_rapid_outages"
+
+        self.outage_types = [
+            "graceful_shutdown",
+            "container_stop",
+            # "interface_full_network_interrupt",
+        ]
+
+        # Names
+        self.lvol_base = f"lvl{_rand_id(12)}"
+        self.clone_base = f"cln{_rand_id(12)}"
+        self.snap_base = f"snap{_rand_id(12)}"
+
+        # Logging file for outages
+        self.outage_log_file = os.path.join("logs", f"outage_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
+        self._init_outage_log()
+
+    # ---------- small utilities ----------
+
+    def _init_outage_log(self):
+        os.makedirs(os.path.dirname(self.outage_log_file), exist_ok=True)
+        with open(self.outage_log_file, "w") as f:
+            f.write("Timestamp,Node,Outage_Type,Event\n")
+
+    def _log_outage_event(self, node, outage_type, event):
+        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        with open(self.outage_log_file, "a") as f:
+            f.write(f"{ts},{node},{outage_type},{event}\n")
+
+    def _short_bs(self):
+        # return f"{2 ** random.randint(2, 7)}K"  # 4K–128K
+        return f"{2 ** 6}K"
+
+    def _pick_outage(self):
+        random.shuffle(self.outage_types)
+        return self.outage_types[0]
+
+    # ---------- cluster bootstrap ----------
+
+    def _wait_cluster_active(self, timeout=900, poll=5):
+        """
+        Poll `sbctl cluster list` until status ACTIVE.
+        Avoids 400 in_activation when creating lvol/snap/clone during bring-up.
+        """
+        end = datetime.now().timestamp() + timeout
+        while datetime.now().timestamp() < end:
+            try:
+                info = self.ssh_obj.cluster_list(self.mgmt_nodes[0], self.cluster_id)  # must wrap "sbctl cluster list"
+                self.logger.info(info)
+                # Expect a single row with Status
+                status = str(info).upper()
+                if "ACTIVE" in status:
+                    return
+            except Exception as e:
+                self.logger.info(f"ERROR: {e}")
+            sleep_n_sec(poll)
+        raise RuntimeError("Cluster did not become ACTIVE within timeout")
+
+    def _bootstrap_cluster(self):
+        # Ensure Cluster is ACTIVE
+        self._wait_cluster_active()
+
+        # create pool
+        self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+
+        # discover storage nodes
+        storage_nodes = self.sbcli_utils.get_storage_nodes()
+        for res in storage_nodes['results']:
+            self.sn_nodes.append(res["uuid"])
+            self.sn_nodes_with_sec.append(res["uuid"])
+            self.sn_primary_secondary_map[res["uuid"]] = res["secondary_node_id"]
+        
+        self.logger.info(f"[LFNG] SN sec map: {self.sn_primary_secondary_map}")
+
+        # initial lvols + mount + then later clone from snapshots
+        self._create_lvols(count=self.total_lvols)  # start_fio=False → we launch after clones
+        self._seed_snapshots_and_clones()           # also mounts clones
+
+        # Start 30 min FIO on all (lvols + clones)
+        self._kick_fio_for_all(runtime=self._per_wave_fio_runtime)
+
+        # start container logs
+        if not self.k8s_test:
+            for node in self.storage_nodes:
+                self.ssh_obj.restart_docker_logging(
+                    node_ip=node,
+                    containers=self.container_nodes[node],
+                    log_dir=os.path.join(self.docker_logs_path, node),
+                    test_name=self.test_name
+                )
+        else:
+            self.runner_k8s_log.restart_logging()
+
+    # ---------- lvol / fio helpers ----------
+
+    def _create_lvols(self, count=1):
+        for _ in range(count):
+            fs_type = random.choice(["ext4", "xfs"])
+            is_crypto = random.choice([True, False])
+            name_core = f"{self.lvol_base}_{_rand_id(6, first_alpha=False)}"
+            lvol_name = name_core if not is_crypto else f"c{name_core}"
+
+            kwargs = dict(
+                lvol_name=lvol_name,
+                pool_name=self.pool_name,
+                size=self.lvol_size,
+                crypto=is_crypto,
+                key1=self.lvol_crypt_keys[0],
+                key2=self.lvol_crypt_keys[1],
+            )
+
+            # Avoid outage node & partner during initial placement
+            if self.current_outage_node:
+                skip_nodes = [self.current_outage_node, self.sn_primary_secondary_map.get(self.current_outage_node)]
+                skip_nodes += [p for p, s in self.sn_primary_secondary_map.items() if s == self.current_outage_node]
+                host_id = [n for n in self.sn_nodes_with_sec if n not in skip_nodes]
+                if host_id:
+                    kwargs["host_id"] = host_id[0]
+
+            # Ensure cluster ACTIVE before creating
+            self._wait_cluster_active()
+
+            try:
+                self.sbcli_utils.add_lvol(**kwargs)
+            except Exception as e:
+                self.logger.warning(f"[LFNG] lvol create failed ({lvol_name}) → {e}; retry once after ACTIVE gate")
+                self._wait_cluster_active()
+                self.sbcli_utils.add_lvol(**kwargs)
+
+            # record
+            lvol_id = self.sbcli_utils.get_lvol_id(lvol_name)
+            self.lvol_mount_details[lvol_name] = {
+                "ID": lvol_id,
+                "Command": None,
+                "Mount": None,
+                "Device": None,
+                "MD5": None,
+                "FS": fs_type,
+                "Log": f"{self.log_path}/{lvol_name}.log",
+                "snapshots": [],
+                "iolog_base_path": f"{self.log_path}/{lvol_name}_fio_iolog",
+            }
+
+            # refresh list
+            self.ssh_obj.exec_command(node=self.mgmt_nodes[0], command=f"{self.base_cmd} lvol list", supress_logs=True)
+
+            # track node placement
+            lvol_node_id = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id)[0]["node_id"]
+            self.node_vs_lvol.setdefault(lvol_node_id, []).append(lvol_name)
+
+            # connect
+            connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=lvol_name)
+            self.lvol_mount_details[lvol_name]["Command"] = connect_ls
+
+            client_node = random.choice(self.fio_node)
+            self.lvol_mount_details[lvol_name]["Client"] = client_node
+
+            initial = self.ssh_obj.get_devices(node=client_node)
+            for c in connect_ls:
+                _, err = self.ssh_obj.exec_command(node=client_node, command=c)
+                if err:
+                    nqn = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id)[0]["nqn"]
+                    self.ssh_obj.disconnect_nvme(node=client_node, nqn_grep=nqn)
+                    self.logger.info(f"[LFNG] connect error → clean lvol {lvol_name}")
+                    self.sbcli_utils.delete_lvol(lvol_name=lvol_name, max_attempt=20, skip_error=True)
+                    sleep_n_sec(3)
+                    del self.lvol_mount_details[lvol_name]
+                    self.node_vs_lvol[lvol_node_id].remove(lvol_name)
+                    break
+
+            final = self.ssh_obj.get_devices(node=client_node)
+            new_dev = None
+            for d in final:
+                if d not in initial:
+                    new_dev = f"/dev/{d.strip()}"
+                    break
+            if not new_dev:
+                raise LvolNotConnectException("LVOL did not connect")
+
+            self.lvol_mount_details[lvol_name]["Device"] = new_dev
+            self.ssh_obj.format_disk(node=client_node, device=new_dev, fs_type=fs_type)
+
+            mnt = f"{self.mount_path}/{lvol_name}"
+            self.ssh_obj.mount_path(node=client_node, device=new_dev, mount_path=mnt)
+            self.lvol_mount_details[lvol_name]["Mount"] = mnt
+
+            # clean old logs
+            self.ssh_obj.delete_files(client_node, [
+                f"{mnt}/*fio*",
+                f"{self.log_path}/local-{lvol_name}_fio*",
+                f"{self.log_path}/{lvol_name}_fio_iolog*"
+            ])
+
+    def _seed_snapshots_and_clones(self):
+        """Create one snapshot and one clone per lvol (best effort). Mount clones on same client."""
+        for lvol, det in list(self.lvol_mount_details.items()):
+            # Ensure ACTIVE
+            self._wait_cluster_active()
+
+            snap_name = f"{self.snap_base}_{_rand_id(8, first_alpha=False)}"
+            out, err = self.ssh_obj.add_snapshot(self.mgmt_nodes[0], det["ID"], snap_name)
+            if "(False," in str(out) or "(False," in str(err):
+                self.logger.warning(f"[LFNG] snapshot create failed for {lvol} → skip clone")
+                continue
+
+            self.snapshot_names.append(snap_name)
+            node_id = self.sbcli_utils.get_lvol_details(lvol_id=det["ID"])[0]["node_id"]
+            self.snap_vs_node[snap_name] = node_id
+            det["snapshots"].append(snap_name)
+
+            snap_id = self.ssh_obj.get_snapshot_id(self.mgmt_nodes[0], snap_name)
+            clone_name = f"{self.clone_base}_{_rand_id(8, first_alpha=False)}"
+            try:
+                self.ssh_obj.add_clone(self.mgmt_nodes[0], snap_id, clone_name)
+            except Exception as e:
+                self.logger.warning(f"[LFNG] clone create failed for {lvol} → {e}")
+                continue
+
+            # connect clone
+            fs_type = det["FS"]
+            client = det["Client"]
+
+            self.clone_mount_details[clone_name] = {
+                "ID": self.sbcli_utils.get_lvol_id(clone_name),
+                "Command": None,
+                "Mount": None,
+                "Device": None,
+                "MD5": None,
+                "FS": fs_type,
+                "Log": f"{self.log_path}/{clone_name}.log",
+                "snapshot": snap_name,
+                "Client": client,
+                "iolog_base_path": f"{self.log_path}/{clone_name}_fio_iolog",
+            }
+
+            connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=clone_name)
+            self.clone_mount_details[clone_name]["Command"] = connect_ls
+
+            initial = self.ssh_obj.get_devices(node=client)
+            for c in connect_ls:
+                _, err = self.ssh_obj.exec_command(node=client, command=c)
+                if err:
+                    nqn = self.sbcli_utils.get_lvol_details(lvol_id=self.clone_mount_details[clone_name]["ID"])[0]["nqn"]
+                    self.ssh_obj.disconnect_nvme(node=client, nqn_grep=nqn)
+                    self.logger.info("[LFNG] connect clone error → cleanup")
+                    self.sbcli_utils.delete_lvol(lvol_name=clone_name, max_attempt=20, skip_error=True)
+                    sleep_n_sec(3)
+                    del self.clone_mount_details[clone_name]
+                    continue
+
+            final = self.ssh_obj.get_devices(node=client)
+            new_dev = None
+            for d in final:
+                if d not in initial:
+                    new_dev = f"/dev/{d.strip()}"
+                    break
+            if not new_dev:
+                raise LvolNotConnectException("Clone did not connect")
+
+            self.clone_mount_details[clone_name]["Device"] = new_dev
+            if fs_type == "xfs":
+                self.ssh_obj.clone_mount_gen_uuid(client, new_dev)
+            mnt = f"{self.mount_path}/{clone_name}"
+            self.ssh_obj.mount_path(node=client, device=new_dev, mount_path=mnt)
+            self.clone_mount_details[clone_name]["Mount"] = mnt
+
+            # purge old logs
+            self.ssh_obj.delete_files(client, [
+                f"{self.log_path}/local-{clone_name}_fio*",
+                f"{self.log_path}/{clone_name}_fio_iolog*",
+                f"{mnt}/*fio*"
+            ])
+
+    def _kick_fio_for_all(self, runtime=None):
+        """Start verified fio (PID-checked; auto-rerun) for all lvols + clones."""
+        # small stagger to avoid SSH bursts
+        def _launch(name, det):
+            self.ssh_obj.run_fio_test(
+                det["Client"], None, det["Mount"], det["Log"],
+                size=self.fio_size, name=f"{name}_fio", rw="randrw",
+                bs=self._short_bs(), nrfiles=8, iodepth=1, numjobs=2,
+                time_based=True, runtime=runtime, log_avg_msec=1000,
+                iolog_file=det["iolog_base_path"], max_latency="30s",
+                verify="md5", verify_dump=1, verify_fatal=1, retries=6,
+                use_latency=False
+            )
+
+        for lvol, det in self.lvol_mount_details.items():
+            self.ssh_obj.delete_files(det["Client"], [f"/mnt/{lvol}/*"])
+            t = threading.Thread(target=_launch, args=(lvol, det))
+            t.start()
+            self.fio_threads.append(t)
+            sleep_n_sec(0.2)
+
+        for cname, det in self.clone_mount_details.items():
+            self.ssh_obj.delete_files(det["Client"], [f"/mnt/{cname}/*"])
+            t = threading.Thread(target=_launch, args=(cname, det))
+            t.start()
+            self.fio_threads.append(t)
+            sleep_n_sec(0.2)
+
+    # ---------- outage flow ----------
+
+    def _perform_outage(self):
+        random.shuffle(self.sn_nodes)
+        self.current_outage_node = self.sn_nodes[0]
+        outage_type = self._pick_outage()
+
+        if self.first_outage_ts is None:
+            self.first_outage_ts = int(datetime.now().timestamp())
+
+        cur_node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node)
+        cur_node_ip = cur_node_details[0]["mgmt_ip"]
+        self.ssh_obj.fetch_distrib_logs(
+            storage_node_ip=cur_node_ip,
+            storage_node_id=self.current_outage_node,
+            logs_path=self.docker_logs_path
+        )
+        
+        # self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
+        #                           storage_node_id=self.current_outage_node)
+
+        self.outage_start_time = int(datetime.now().timestamp())
+        self._log_outage_event(self.current_outage_node, outage_type, "Outage started")
+        self.logger.info(f"[LFNG] Outage={outage_type} node={self.current_outage_node}")
+
+        node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node)
+        node_ip = node_details[0]["mgmt_ip"]
+        node_rpc_port = node_details[0]["rpc_port"]
+
+        if outage_type == "graceful_shutdown":
+            # suspend -> wait SUSPENDED -> shutdown -> wait OFFLINE
+            try:
+                self.logger.info(f"[LFNG] Suspending node via: sbcli-dev sn suspend {self.current_outage_node}")
+                self.sbcli_utils.suspend_node(node_uuid=self.current_outage_node, expected_error_code=[503])
+                self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "suspended", timeout=600)
+            except Exception:
+                self.logger.warning("[LFNG] Suspend failed from API; ignoring if already suspended")
+
+            try:
+                self.sbcli_utils.shutdown_node(node_uuid=self.current_outage_node, force=True, expected_error_code=[503])
+            except Exception:
+                self.ssh_obj.shutdown_node(node=self.mgmt_nodes[0], node_id=self.current_outage_node, force=True)
+            self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "offline", timeout=900)
+
+            for node in self.sn_nodes_with_sec:
+                if node != self.current_outage_node:
+                    cur_node_details = self.sbcli_utils.get_storage_node_details(node)
+                    cur_node_ip = cur_node_details[0]["mgmt_ip"]
+                    self.ssh_obj.fetch_distrib_logs(
+                        storage_node_ip=cur_node_ip,
+                        storage_node_id=node,
+                        logs_path=self.docker_logs_path
+                    )
+            # Keep node strictly offline for 5 minutes
+            sleep_n_sec(500)
+
+        elif outage_type == "container_stop":
+            self.ssh_obj.stop_spdk_process(node_ip, node_rpc_port)
+
+        elif outage_type == "interface_full_network_interrupt":
+            # Down all active data interfaces for ~300s (5 minutes) with ping verification
+            active = self.ssh_obj.get_active_interfaces(node_ip)
+            self.ssh_obj.disconnect_all_active_interfaces(node_ip, active, 300)
+            sleep_n_sec(280)
+
+        return outage_type
+
+    def restart_nodes_after_failover(self, outage_type):
+
+        self.logger.info(f"[LFNG] Recover outage={outage_type} node={self.current_outage_node}")
+
+        cur_node_details = self.sbcli_utils.get_storage_node_details(self.sn_primary_secondary_map[self.current_outage_node])
+        cur_node_ip = cur_node_details[0]["mgmt_ip"]
+        self.ssh_obj.fetch_distrib_logs(
+            storage_node_ip=cur_node_ip,
+            storage_node_id=self.sn_primary_secondary_map[self.current_outage_node],
+            logs_path=self.docker_logs_path
+        )
+
+        # Only wait for ONLINE (skip deep health)
+        if outage_type == 'graceful_shutdown':
+            try:
+                self.ssh_obj.restart_node(self.mgmt_nodes[0], node_id=self.current_outage_node, force=True)
+            except Exception:
+                pass
+            self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=900)
+        elif outage_type == 'container_stop':
+            self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=900)
+        elif "network_interrupt" in outage_type:
+            self.sbcli_utils.wait_for_storage_node_status(self.current_outage_node, "online", timeout=900)
+
+        self._log_outage_event(self.current_outage_node, outage_type, "Node online")
+        self.outage_end_time = int(datetime.now().timestamp())
+
+        cur_node_details = self.sbcli_utils.get_storage_node_details(self.current_outage_node)
+        cur_node_ip = cur_node_details[0]["mgmt_ip"]
+        self.ssh_obj.fetch_distrib_logs(
+            storage_node_ip=cur_node_ip,
+            storage_node_id=self.current_outage_node,
+            logs_path=self.docker_logs_path
+        )
+
+        # keep container log streaming going
+        if not self.k8s_test:
+            for node in self.storage_nodes:
+                self.ssh_obj.restart_docker_logging(
+                    node_ip=node,
+                    containers=self.container_nodes[node],
+                    log_dir=os.path.join(self.docker_logs_path, node),
+                    test_name=self.test_name
+                )
+        else:
+            self.runner_k8s_log.restart_logging()
+
+        # small cool-down before next outage to reduce SSH churn
+        # sleep_n_sec(random.randint(1, 5))
+
+    # ---------- main ----------
+
+    def run(self):
+        self.logger.info("[LFNG] Starting RandomRapidFailoverNoGap")
+        self._bootstrap_cluster()
+        sleep_n_sec(5)
+
+        iteration = 1
+        while True:
+            outage_type = self._perform_outage()
+            self.restart_nodes_after_failover(outage_type)
+
+            self._iter += 1
+            if self._iter % self.validate_every == 0:
+                self.logger.info(f"[LFNG] {self._iter} outages → wait & validate all FIO")
+                # Join launch threads so we know all jobs issued
+                for t in self.fio_threads:
+                    t.join(timeout=10)
+                self.fio_threads = []
+
+                # Wait for all fio jobs to end (they’re 30min jobs)
+                self.common_utils.manage_fio_threads(self.fio_node, [], timeout=self._fio_wait_timeout)
+
+                for node in self.sn_nodes_with_sec:
+                    cur_node_details = self.sbcli_utils.get_storage_node_details(node)
+                    cur_node_ip = cur_node_details[0]["mgmt_ip"]
+                    self.ssh_obj.fetch_distrib_logs(
+                        storage_node_ip=cur_node_ip,
+                        storage_node_id=node,
+                        logs_path=self.docker_logs_path
+                    )
+                
+                    self.ssh_obj.dump_lvstore(node_ip=self.mgmt_nodes[0],
+                                              storage_node_id=node)
+
+                # Validate logs
+                for lvol, det in self.lvol_mount_details.items():
+                    self.common_utils.validate_fio_test(det["Client"], log_file=det["Log"])
+                for cname, det in self.clone_mount_details.items():
+                    self.common_utils.validate_fio_test(det["Client"], log_file=det["Log"])
+
+                # Optional: wait for migration window after FIO completes
+                # (replace with your actual migration-check, if any)
+                self.logger.info("[LFNG] FIO validated; pausing briefly for migration window")
+                sleep_n_sec(10)
+
+                # Re-kick next 30min wave
+                self._kick_fio_for_all(runtime=self._per_wave_fio_runtime)
+                self.logger.info("[LFNG] Next FIO wave started")
+
+            self.logger.info(f"[LFNG] Iter {iteration} complete → starting next outage ASAP")
+            iteration += 1
\ No newline at end of file
diff --git a/e2e/stress_test/continuous_failover_ha_multi_outage.py b/e2e/stress_test/continuous_failover_ha_multi_outage.py
index fb5f6d507..e96a0b547 100644
--- a/e2e/stress_test/continuous_failover_ha_multi_outage.py
+++ b/e2e/stress_test/continuous_failover_ha_multi_outage.py
@@ -1,5 +1,6 @@
 from utils.common_utils import sleep_n_sec
 from datetime import datetime
+from collections import defaultdict
 from stress_test.continuous_failover_ha_multi_client import RandomMultiClientFailoverTest
 from exceptions.custom_exception import LvolNotConnectException
 import threading
@@ -8,13 +9,20 @@
 import os
 
 
+generated_sequences = set()
+
 def generate_random_sequence(length):
     letters = string.ascii_uppercase
     numbers = string.digits
     all_chars = letters + numbers
-    first_char = random.choice(letters)
-    remaining_chars = ''.join(random.choices(all_chars, k=length - 1))
-    return first_char + remaining_chars
+
+    while True:
+        first_char = random.choice(letters)
+        remaining_chars = ''.join(random.choices(all_chars, k=length-1))
+        result = first_char + remaining_chars
+        if result not in generated_sequences:
+            generated_sequences.add(result)
+            return result
 
 
 class RandomMultiClientMultiFailoverTest(RandomMultiClientFailoverTest):
@@ -25,7 +33,7 @@ class RandomMultiClientMultiFailoverTest(RandomMultiClientFailoverTest):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.total_lvols = 20
+        self.total_lvols = 40
         self.lvol_name = f"lvl{generate_random_sequence(15)}"
         self.clone_name = f"cln{generate_random_sequence(15)}"
         self.snapshot_name = f"snap{generate_random_sequence(15)}"
@@ -48,9 +56,12 @@ def __init__(self, **kwargs):
         self.lvols_without_sec_connect = []
         self.test_name = "n_plus_k_failover_multi_client_ha"
         self.outage_types = [
+            "graceful_shutdown",
+            "interface_full_network_interrupt"
+        ]
+        self.outage_types2 = [
             "container_stop",
             "graceful_shutdown",
-            "interface_partial_network_interrupt",
             "interface_full_network_interrupt"
         ]
         self.blocked_ports = None
@@ -61,30 +72,101 @@ def _initialize_outage_log(self):
         with open(self.outage_log_file, 'w') as log:
             log.write("Timestamp,Node,Outage_Type,Event\n")
 
-    def log_outage_event(self, node, outage_type, event):
-        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    def log_outage_event(self, node, outage_type, event, outage_time=0):
+        """Log an outage event to the outage log file.
+
+        Args:
+            node (str): Node UUID or IP where the event occurred.
+            outage_type (str): Type of outage (e.g., port_network_interrupt, container_stop, graceful_shutdown).
+            event (str): Event description (e.g., 'Outage started', 'Node restarted').
+            outage_time (int): Minutes to add to self.outage_start_time. If 0/None, use current time.
+        """
+        # Compute timestamp
+        if outage_time:
+            # Uses self.outage_start_time (epoch seconds) + outage_time (minutes)
+            base_epoch = getattr(self, "outage_start_time", None)
+            if isinstance(base_epoch, (int, float)) and base_epoch > 0:
+                ts_dt = datetime.fromtimestamp(int(base_epoch) + int(outage_time) * 60)
+            else:
+                # Fallback to now if outage_start_time is missing/invalid
+                ts_dt = datetime.now()
+        else:
+            ts_dt = datetime.now()
+
+        timestamp = ts_dt.strftime('%Y-%m-%d %H:%M:%S')
+
+        # Write the log line
         with open(self.outage_log_file, 'a') as log:
             log.write(f"{timestamp},{node},{outage_type},{event}\n")
 
+    def _build_reverse_secondary_map(self):
+        rev = defaultdict(set)        # secondary -> {primary,...}
+        for p, s in self.sn_primary_secondary_map.items():
+            if s:
+                rev[s].add(p)
+        return rev
+
+    def _pick_outage_nodes(self, primary_candidates, k):
+        rev = self._build_reverse_secondary_map()
+        order = primary_candidates[:]
+
+        random.shuffle(order)
+
+        chosen, blocked = [], set()
+        for node in order:
+            if node in blocked:
+                continue
+
+            chosen.append(node)
+            blocked.add(node)                            # itself
+            sec = self.sn_primary_secondary_map.get(node)
+            if sec:
+                blocked.add(sec)                         # its secondary
+            blocked.update(rev.get(node, ()))           # any primary whose secondary == node
+
+            if len(chosen) == k:
+                break
+
+        if len(chosen) < k:
+            raise Exception(
+                f"Cannot pick {k} nodes without primary/secondary conflicts; only {len(chosen)} possible with current topology."
+            )
+        return chosen
+
     def perform_n_plus_k_outages(self):
         """
-        Perform K (self.npcs) parallel outages as part of N+K configuration.
-        Ensure only primary nodes are selected for outage.
+        Select K outage nodes such that no two are in a primary/secondary
+        relationship (in either direction). Candidates = keys of the map.
         """
-        primary_nodes = [node for node in self.sn_nodes if not self.sbcli_utils.is_secondary_node(node)]
+        # Candidates are nodes that are primary *for someone* (map keys)
+        primary_candidates = list(self.sn_primary_secondary_map.keys())
+        self.current_outage_nodes = []
 
-        if len(primary_nodes) < self.npcs:
-            raise Exception(f"Not enough primary nodes to perform {self.npcs} outages. Found only {len(primary_nodes)}.")
+        if len(primary_candidates) < self.npcs:
+            raise Exception(
+                f"Need {self.npcs} outage nodes, but only {len(primary_candidates)} primary-role nodes exist."
+            )
 
-        outage_nodes = random.sample(primary_nodes, k=self.npcs)
+        outage_nodes = self._pick_outage_nodes(primary_candidates, self.npcs)
+        self.logger.info(f"Selected outage nodes: {outage_nodes}")
         outage_combinations = []
-
+        outage_num = 0
         for node in outage_nodes:
-            outage_type = random.choice(self.outage_types)
+            if outage_num == 0:
+                outage_type = random.choice(self.outage_types)
+                outage_num = 1
+            else:
+                outage_type = random.choice(self.outage_types2)
             node_details = self.sbcli_utils.get_storage_node_details(node)
             node_ip = node_details[0]["mgmt_ip"]
             node_rpc_port = node_details[0]["rpc_port"]
 
+            self.ssh_obj.fetch_distrib_logs(
+                storage_node_ip=node_ip,
+                storage_node_id=node,
+                logs_path=self.docker_logs_path
+            )
+
             self.logger.info(f"Performing {outage_type} on primary node {node}.")
             self.log_outage_event(node, outage_type, "Outage started")
 
@@ -105,26 +187,74 @@ def perform_n_plus_k_outages(self):
 
     def _graceful_shutdown_node(self, node):
         try:
-            self.sbcli_utils.suspend_node(node_uuid=node, expected_error_code=[503])
-            self.sbcli_utils.wait_for_storage_node_status(node, "suspended", timeout=1000)
-            self.sbcli_utils.shutdown_node(node_uuid=node, expected_error_code=[503])
-            self.sbcli_utils.wait_for_storage_node_status(node, "offline", timeout=1000)
+            sleep_n_sec(10)
+            max_retries = 10
+            retry_delay = 10  # seconds
+            # Retry mechanism for suspending the node
+            for attempt in range(max_retries):
+                try:
+                    if attempt == max_retries - 1:
+                        self.logger.info("[CHECK] Suspending Node via CLI as via API Fails.")
+                        self.ssh_obj.suspend_node(node=self.mgmt_nodes[0],
+                                                  node_id=node)
+                    else:
+                        self.sbcli_utils.suspend_node(node_uuid=node, expected_error_code=[503])
+                    self.sbcli_utils.wait_for_storage_node_status(node, "suspended", timeout=1000)
+                    break  # Exit loop if successful
+                except Exception as _:
+                    if attempt < max_retries - 2:
+                        self.logger.info(f"Attempt {attempt + 1} failed to suspend node. Retrying in {retry_delay} seconds...")
+                        sleep_n_sec(retry_delay)
+                    elif attempt < max_retries - 1:
+                        self.logger.info(f"Attempt {attempt + 1} failed to suspend node via API. Retrying in {retry_delay} seconds via CMD...")
+                        sleep_n_sec(retry_delay)
+                    else:
+                        self.logger.info("Max retries reached. Failed to suspend node.")
+                        raise  # Rethrow the last exception
+
+            sleep_n_sec(10)  # Wait before shutting down
+
+            # Retry mechanism for shutting down the node
+            for attempt in range(max_retries):
+                try:
+                    if attempt == max_retries - 1:
+                        self.logger.info("[CHECK] Shutting down Node via CLI as via API Fails.")
+                        self.ssh_obj.shutdown_node(node=self.mgmt_nodes[0],
+                                                   node_id=node,
+                                                   force=True)
+                    else:
+                        self.sbcli_utils.shutdown_node(node_uuid=node, force=True,
+                                                       expected_error_code=[503])
+                    self.sbcli_utils.wait_for_storage_node_status(node, "offline", timeout=1000)
+                    break  # Exit loop if successful
+                except Exception as _:
+                    if attempt < max_retries - 2:
+                        self.logger.info(f"Attempt {attempt + 1} failed to shutdown node. Retrying in {retry_delay} seconds...")
+                        sleep_n_sec(retry_delay)
+                    elif attempt < max_retries - 1:
+                        self.logger.info(f"Attempt {attempt + 1} failed to shutdown node via API. Retrying in {retry_delay} seconds via CMD...")
+                        sleep_n_sec(retry_delay)
+                    else:
+                        self.logger.info("Max retries reached. Failed to shutdown node.")
+                        raise  # Rethrow the last exception
         except Exception as e:
             self.logger.error(f"Failed graceful shutdown for node {node}: {str(e)}")
 
     def _disconnect_partial_interface(self, node, node_ip):
         active_interfaces = [nic["if_name"] for nic in self.sbcli_utils.get_storage_node_details(node)[0]["data_nics"]]
+        active_interfaces = ['eth1']
         self.disconnect_thread = threading.Thread(
             target=self.ssh_obj.disconnect_all_active_interfaces,
-            args=(node_ip, active_interfaces, 600)
+            args=(node_ip, active_interfaces, 300)
         )
         self.disconnect_thread.start()
 
     def _disconnect_full_interface(self, node, node_ip):
+        self.logger.info("Handling full interface based network interruption...")
         active_interfaces = self.ssh_obj.get_active_interfaces(node_ip)
         self.disconnect_thread = threading.Thread(
             target=self.ssh_obj.disconnect_all_active_interfaces,
-            args=(node_ip, active_interfaces, 600)
+            args=(node_ip, active_interfaces, 300)
         )
         self.disconnect_thread.start()
 
@@ -134,50 +264,81 @@ def delete_random_lvols(self, count):
             lvol for node, lvols in self.node_vs_lvol.items()
             if node not in self.current_outage_nodes for lvol in lvols
         ]
+
+        self.logger.info(f"Available Lvols: {available_lvols}")
         if len(available_lvols) < count:
             self.logger.warning("Not enough lvols available to delete the requested count.")
             count = len(available_lvols)
 
         for lvol in random.sample(available_lvols, count):
-            self.logger.info(f"Deleting lvol {lvol}")
+            self.logger.info(f"Deleting lvol {lvol}.")
             snapshots = self.lvol_mount_details[lvol]["snapshots"]
             to_delete = []
-
-            # Handle dependent clones
             for clone_name, clone_details in self.clone_mount_details.items():
                 if clone_details["snapshot"] in snapshots:
-                    self.common_utils.validate_fio_test(clone_details["Client"], clone_details["Log"])
+                    self.common_utils.validate_fio_test(clone_details["Client"],
+                                                        log_file=clone_details["Log"])
                     self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=False)
                     fio_pids = self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=True)
+                    sleep_n_sec(10)
                     for pid in fio_pids:
                         self.ssh_obj.kill_processes(clone_details["Client"], pid=pid)
+                    attempt = 1
+                    while len(fio_pids) > 2:
+                        self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=False)
+                        fio_pids = self.ssh_obj.find_process_name(clone_details["Client"], f"{clone_name}_fio", return_pid=True)
+                        if attempt >= 30:
+                            raise Exception("FIO not killed on clone")
+                        attempt += 1
+                        sleep_n_sec(20)
+                    
+                    sleep_n_sec(10)
                     self.ssh_obj.unmount_path(clone_details["Client"], f"/mnt/{clone_name}")
                     self.ssh_obj.remove_dir(clone_details["Client"], dir_path=f"/mnt/{clone_name}")
                     self.disconnect_lvol(clone_details['ID'])
-                    self.sbcli_utils.delete_lvol(clone_name)
+                    self.sbcli_utils.delete_lvol(clone_name, max_attempt=20, skip_error=True)
+                    sleep_n_sec(30)
                     if clone_name in self.lvols_without_sec_connect:
                         self.lvols_without_sec_connect.remove(clone_name)
                     to_delete.append(clone_name)
-
+                    self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone_name}_fio*"])
+                    self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone_name}_fio_iolog*"])
+                    self.ssh_obj.delete_files(clone_details["Client"], [f"/mnt/{clone_name}/*"])
+                    # self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone_name}*.log"])
             for del_key in to_delete:
                 del self.clone_mount_details[del_key]
-
-            # Delete snapshots
             for snapshot in snapshots:
                 snapshot_id = self.ssh_obj.get_snapshot_id(self.mgmt_nodes[0], snapshot)
+                # snapshot_node = self.snap_vs_node[snapshot]
+                # if snapshot_node not in skip_nodes:
                 self.ssh_obj.delete_snapshot(self.mgmt_nodes[0], snapshot_id=snapshot_id)
                 self.snapshot_names.remove(snapshot)
 
-            # Stop FIO and cleanup lvol
-            self.common_utils.validate_fio_test(self.lvol_mount_details[lvol]["Client"], self.lvol_mount_details[lvol]["Log"])
+            self.common_utils.validate_fio_test(self.lvol_mount_details[lvol]["Client"],
+                                                log_file=self.lvol_mount_details[lvol]["Log"])
             self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=False)
+            sleep_n_sec(10)
             fio_pids = self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=True)
             for pid in fio_pids:
                 self.ssh_obj.kill_processes(self.lvol_mount_details[lvol]["Client"], pid=pid)
+            attempt = 1
+            while len(fio_pids) > 2:
+                self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=False)
+                fio_pids = self.ssh_obj.find_process_name(self.lvol_mount_details[lvol]["Client"], f"{lvol}_fio", return_pid=True)
+                if attempt >= 30:
+                    raise Exception("FIO not killed on lvols")
+                attempt += 1
+                sleep_n_sec(20)
+
+            sleep_n_sec(10)
             self.ssh_obj.unmount_path(self.lvol_mount_details[lvol]["Client"], f"/mnt/{lvol}")
             self.ssh_obj.remove_dir(self.lvol_mount_details[lvol]["Client"], dir_path=f"/mnt/{lvol}")
             self.disconnect_lvol(self.lvol_mount_details[lvol]['ID'])
-            self.sbcli_utils.delete_lvol(lvol)
+            self.sbcli_utils.delete_lvol(lvol, max_attempt=20, skip_error=True)
+            self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"{self.log_path}/local-{lvol}_fio*"])
+            self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"])
+            self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"/mnt/{lvol}/*"])
+            # self.ssh_obj.delete_files(self.lvol_mount_details[lvol]["Client"], [f"{self.log_path}/{lvol}*.log"])
             if lvol in self.lvols_without_sec_connect:
                 self.lvols_without_sec_connect.remove(lvol)
             del self.lvol_mount_details[lvol]
@@ -190,14 +351,19 @@ def delete_random_lvols(self, count):
     def create_snapshots_and_clones(self):
         """Create snapshots and clones during an outage, avoiding lvols on outage nodes."""
         self.int_lvol_size += 1
+        skip_nodes = [node for node in self.sn_primary_secondary_map if self.sn_primary_secondary_map[node] in self.current_outage_nodes]
+        self.logger.info(f"Skip Nodes: {skip_nodes}")
+        for node in self.current_outage_nodes:
+            skip_nodes.append(node)
+        self.logger.info(f"Skip Nodes: {skip_nodes}")
         available_lvols = [
             lvol for node, lvols in self.node_vs_lvol.items()
-            if node not in self.current_outage_nodes for lvol in lvols
+            if node not in skip_nodes for lvol in lvols
         ]
         if not available_lvols:
             self.logger.warning("No available lvols to create snapshots and clones.")
             return
-
+        self.logger.info(f"Available lvols: {available_lvols}")
         for _ in range(3):
             random.shuffle(available_lvols)
             lvol = available_lvols[0]
@@ -205,69 +371,140 @@ def create_snapshots_and_clones(self):
             temp_name = generate_random_sequence(5)
             if snapshot_name in self.snapshot_names:
                 snapshot_name = f"{snapshot_name}_{temp_name}"
-
             try:
                 output, error = self.ssh_obj.add_snapshot(self.mgmt_nodes[0], self.lvol_mount_details[lvol]["ID"], snapshot_name)
-                if "(False," in output or "(False," in error:
-                    raise Exception(output or error)
+                if "(False," in output:
+                    raise Exception(output)
+                if "(False," in error:
+                    raise Exception(error)
             except Exception as e:
-                self.logger.warning(f"Snapshot creation failed: {e}")
-                continue
-
+                self.logger.warning(f"Snap creation fails with {str(e)}. Retrying with different name.")
+                try:
+                    snapshot_name = f"snap_{lvol}"
+                    temp_name = generate_random_sequence(5)
+                    snapshot_name = f"{snapshot_name}_{temp_name}"
+                    self.ssh_obj.add_snapshot(self.mgmt_nodes[0], self.lvol_mount_details[lvol]["ID"], snapshot_name)
+                except Exception as exp:
+                    self.logger.warning(f"Retry Snap creation fails with {str(exp)}.")
+                    continue
+                
             self.snapshot_names.append(snapshot_name)
+            lvol_node_id = self.sbcli_utils.get_lvol_details(
+                lvol_id=self.lvol_mount_details[lvol]["ID"])[0]["node_id"]
+            self.snap_vs_node[snapshot_name] = lvol_node_id
             self.lvol_mount_details[lvol]["snapshots"].append(snapshot_name)
-
             clone_name = f"clone_{generate_random_sequence(15)}"
+            if clone_name in list(self.clone_mount_details):
+                clone_name = f"{clone_name}_{temp_name}"
             sleep_n_sec(30)
             snapshot_id = self.ssh_obj.get_snapshot_id(self.mgmt_nodes[0], snapshot_name)
             try:
                 self.ssh_obj.add_clone(self.mgmt_nodes[0], snapshot_id, clone_name)
             except Exception as e:
-                self.logger.warning(f"Clone creation failed: {e}")
-                continue
-
+                self.logger.warning(f"Clone creation fails with {str(e)}. Retrying with different name.")
+                try:
+                    clone_name = f"clone_{generate_random_sequence(15)}"
+                    temp_name = generate_random_sequence(5)
+                    clone_name = f"{clone_name}_{temp_name}"
+                    self.ssh_obj.add_clone(self.mgmt_nodes[0], snapshot_id, clone_name)
+                except Exception as exp:
+                    self.logger.warning(f"Retry Clone creation fails with {str(exp)}.")
+                    continue
             fs_type = self.lvol_mount_details[lvol]["FS"]
             client = self.lvol_mount_details[lvol]["Client"]
             self.clone_mount_details[clone_name] = {
-                "ID": self.sbcli_utils.get_lvol_id(clone_name),
-                "Command": None,
-                "Mount": None,
-                "Device": None,
-                "MD5": None,
-                "FS": fs_type,
-                "Log": f"{self.log_path}/{clone_name}.log",
-                "snapshot": snapshot_name,
-                "Client": client
+                   "ID": self.sbcli_utils.get_lvol_id(clone_name),
+                   "Command": None,
+                   "Mount": None,
+                   "Device": None,
+                   "MD5": None,
+                   "FS": fs_type,
+                   "Log": f"{self.log_path}/{clone_name}.log",
+                   "snapshot": snapshot_name,
+                   "Client": client,
+                   "iolog_base_path": f"{self.log_path}/{clone_name}_fio_iolog"
             }
 
+            self.logger.info(f"Created clone {clone_name}.")
+
+            sleep_n_sec(3)
+
+            self.ssh_obj.exec_command(node=self.mgmt_nodes[0],
+                                      command=f"{self.base_cmd} lvol list")
+
             connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=clone_name)
             self.clone_mount_details[clone_name]["Command"] = connect_ls
+
+            # if self.secondary_outage:
+            #     connect_ls = [connect_ls[0]]
+            #     self.lvols_without_sec_connect.append(clone_name)
+
             initial_devices = self.ssh_obj.get_devices(node=client)
             for connect_str in connect_ls:
                 _, error = self.ssh_obj.exec_command(node=client, command=connect_str)
                 if error:
-                    self.logger.warning(f"Clone connect failed: {error}")
+                    lvol_details = self.sbcli_utils.get_lvol_details(lvol_id=self.clone_mount_details[clone_name]["ID"])
+                    nqn = lvol_details[0]["nqn"]
+                    self.ssh_obj.disconnect_nvme(node=client, nqn_grep=nqn)
+                    self.logger.info(f"Connecting clone {clone_name} has error: {error}. Disconnect all connections for that clone!!")
+                    self.sbcli_utils.delete_lvol(lvol_name=clone_name, max_attempt=20, skip_error=True)
+                    sleep_n_sec(30)
+                    del self.clone_mount_details[clone_name]
                     continue
 
+            sleep_n_sec(3)
             final_devices = self.ssh_obj.get_devices(node=client)
-            lvol_device = next((f"/dev/{d.strip()}" for d in final_devices if d not in initial_devices), None)
+            lvol_device = None
+            for device in final_devices:
+                if device not in initial_devices:
+                    lvol_device = f"/dev/{device.strip()}"
+                    break
             if not lvol_device:
-                raise LvolNotConnectException("Clone device not found")
+                raise LvolNotConnectException("LVOL did not connect")
             self.clone_mount_details[clone_name]["Device"] = lvol_device
 
+            # Mount and Run FIO
             if fs_type == "xfs":
                 self.ssh_obj.clone_mount_gen_uuid(client, lvol_device)
-
             mount_point = f"{self.mount_path}/{clone_name}"
             self.ssh_obj.mount_path(node=client, device=lvol_device, mount_path=mount_point)
             self.clone_mount_details[clone_name]["Mount"] = mount_point
 
+            # clone_node_id = self.sbcli_utils.get_lvol_details(
+            #     lvol_id=self.lvol_mount_details[clone_name]["ID"])[0]["node_id"]
+            
+            # self.node_vs_lvol[clone_node_id].append(clone_name)
+
+            sleep_n_sec(10)
+
             self.ssh_obj.delete_files(client, [f"{mount_point}/*fio*"])
             self.ssh_obj.delete_files(client, [f"{self.log_path}/local-{clone_name}_fio*"])
-
+            self.ssh_obj.delete_files(client, [f"{self.log_path}/{clone_name}_fio_iolog*"])
+
+            sleep_n_sec(5)
+
+            # Start FIO
+            # fio_thread = threading.Thread(
+            #     target=self.ssh_obj.run_fio_test,
+            #     args=(client, None, self.clone_mount_details[clone_name]["Mount"], self.clone_mount_details[clone_name]["Log"]),
+            #     kwargs={
+            #         "size": self.fio_size,
+            #         "name": f"{clone_name}_fio",
+            #         "rw": "randrw",
+            #         "bs": f"{2 ** random.randint(2, 7)}K",
+            #         "nrfiles": 16,
+            #         "iodepth": 1,
+            #         "numjobs": 5,
+            #         "time_based": True,
+            #         "runtime": 2000,
+            #         "log_avg_msec": 1000,
+            #         "iolog_file": self.clone_mount_details[clone_name]["iolog_base_path"],
+            #         "debug": True,
+            #     },
+            # )
             fio_thread = threading.Thread(
                 target=self.ssh_obj.run_fio_test,
-                args=(client, None, mount_point, self.clone_mount_details[clone_name]["Log"]),
+                args=(client, None, self.clone_mount_details[clone_name]["Mount"], self.clone_mount_details[clone_name]["Log"]),
                 kwargs={
                     "size": self.fio_size,
                     "name": f"{clone_name}_fio",
@@ -278,15 +515,21 @@ def create_snapshots_and_clones(self):
                     "numjobs": 5,
                     "time_based": True,
                     "runtime": 2000,
+                    "log_avg_msec": 1000,
+                    "iolog_file": self.clone_mount_details[clone_name]["iolog_base_path"],
                 },
             )
             fio_thread.start()
             self.fio_threads.append(fio_thread)
+            self.logger.info(f"Created snapshot {snapshot_name} and clone {clone_name}.")
 
-            self.logger.info(f"Created snapshot {snapshot_name} and clone {clone_name}")
-            self.sbcli_utils.resize_lvol(self.lvol_mount_details[lvol]["ID"], f"{self.int_lvol_size}G")
+            if self.lvol_mount_details[lvol]["ID"]:
+                self.sbcli_utils.resize_lvol(lvol_id=self.lvol_mount_details[lvol]["ID"],
+                                             new_size=f"{self.int_lvol_size}G")
             sleep_n_sec(10)
-            self.sbcli_utils.resize_lvol(self.clone_mount_details[clone_name]["ID"], f"{self.int_lvol_size}G")
+            if self.clone_mount_details[clone_name]["ID"]:
+                self.sbcli_utils.resize_lvol(lvol_id=self.clone_mount_details[clone_name]["ID"],
+                                             new_size=f"{self.int_lvol_size}G")
 
 
     def run(self):
@@ -301,6 +544,8 @@ def run(self):
         for result in storage_nodes['results']:
             self.sn_nodes.append(result["uuid"])
             self.sn_nodes_with_sec.append(result["uuid"])
+            self.sn_primary_secondary_map[result["uuid"]] = result["secondary_node_id"]
+        self.logger.info(f"Secondary node map: {self.sn_primary_secondary_map}")
 
         sleep_n_sec(30)
 
@@ -320,11 +565,23 @@ def run(self):
 
             for node, outage_type in outage_events:
                 self.current_outage_node = node
-                self.restart_nodes_after_failover(outage_type)
+                if outage_type == "container_stop" and self.npcs > 1:
+                    self.restart_nodes_after_failover(outage_type, True)
+                else:
+                    self.restart_nodes_after_failover(outage_type)
 
             self.logger.info("Waiting for fallback recovery.")
             sleep_n_sec(100)
 
+            for node in self.sn_nodes_with_sec:
+                cur_node_details = self.sbcli_utils.get_storage_node_details(node)
+                cur_node_ip = cur_node_details[0]["mgmt_ip"]
+                self.ssh_obj.fetch_distrib_logs(
+                    storage_node_ip=cur_node_ip,
+                    storage_node_id=node,
+                    logs_path=self.docker_logs_path
+                )
+
             time_duration = self.common_utils.calculate_time_duration(
                 start_timestamp=self.outage_start_time,
                 end_timestamp=self.outage_end_time
@@ -343,12 +600,27 @@ def run(self):
             # for node, outage_type in outage_events:
             #     if not self.sbcli_utils.is_secondary_node(node):
             self.validate_migration_for_node(self.outage_start_time, 2000, None, 60, no_task_ok=no_task_ok)
+            self.common_utils.manage_fio_threads(self.fio_node, self.fio_threads, timeout=20000)
 
             for clone, clone_details in self.clone_mount_details.items():
                 self.common_utils.validate_fio_test(clone_details["Client"], clone_details["Log"])
+                self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/local-{clone}_fio*"])
+                self.ssh_obj.delete_files(clone_details["Client"], [f"{self.log_path}/{clone}_fio_iolog*"])
 
             for lvol, lvol_details in self.lvol_mount_details.items():
                 self.common_utils.validate_fio_test(lvol_details["Client"], lvol_details["Log"])
+                self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/local-{lvol}_fio*"])
+                self.ssh_obj.delete_files(lvol_details["Client"], [f"{self.log_path}/{lvol}_fio_iolog*"])
 
             self.logger.info(f"N+K failover iteration {iteration} complete.")
+
+            for node in self.sn_nodes_with_sec:
+                cur_node_details = self.sbcli_utils.get_storage_node_details(node)
+                cur_node_ip = cur_node_details[0]["mgmt_ip"]
+                self.ssh_obj.fetch_distrib_logs(
+                    storage_node_ip=cur_node_ip,
+                    storage_node_id=node,
+                    logs_path=self.docker_logs_path
+                )
             iteration += 1
+
diff --git a/e2e/utils/ssh_utils.py b/e2e/utils/ssh_utils.py
index bd06f06f7..a50a61726 100644
--- a/e2e/utils/ssh_utils.py
+++ b/e2e/utils/ssh_utils.py
@@ -13,6 +13,10 @@
 import string
 import re
 import subprocess
+import shlex
+import socket
+from collections import defaultdict
+from typing import Optional, List
 
 
 SSH_KEY_LOCATION = os.path.join(Path.home(), ".ssh", os.environ.get("KEY_NAME"))
@@ -47,31 +51,227 @@ def __init__(self, bastion_server):
         self.log_monitor_threads = {}
         self.log_monitor_stop_flags = {}
         self.ssh_semaphore = threading.Semaphore(10)  # Max 10 SSH calls in parallel (tune as needed)
+        self._bastion_client = None
+        self._reconnect_locks = defaultdict(threading.Lock)   
+        self.ssh_pass = None
+
+    def _candidate_usernames(self, explicit_user) -> List[str]:
+        if explicit_user:
+            if isinstance(explicit_user, (list, tuple)):
+                return list(explicit_user)
+            return [str(explicit_user)]
+        return ["ec2-user", "ubuntu", "rocky", "root"]
+    
+    def _load_private_keys(self) -> List[paramiko.PKey]:
+        """
+        Try Ed25519 then RSA. If SSH_KEY_LOCATION/env points to a file, use it.
+        Else try ~/.ssh/id_ed25519 and ~/.ssh/id_rsa. If SSH_KEY_PATH is a dir, load all files from it.
+        """
+        paths = []
+        # explicit single file via KEY_NAME → SSH_KEY_LOCATION
+        if SSH_KEY_LOCATION and os.path.isfile(SSH_KEY_LOCATION):
+            paths.append(SSH_KEY_LOCATION)
+        # defaults
+        home = os.path.join(Path.home(), ".ssh")
+        paths.extend([os.path.join(home, "id_ed25519"), os.path.join(home, "id_rsa")])
+
+        keys = []
+        seen = set()
+        for p in paths:
+            if not os.path.exists(p) or p in seen:
+                continue
+            seen.add(p)
+            try:
+                keys.append(paramiko.Ed25519Key.from_private_key_file(p))
+                continue
+            except Exception:
+                pass
+            try:
+                keys.append(paramiko.RSAKey.from_private_key_file(p))
+            except Exception:
+                pass
+        if not keys and not self.ssh_pass:
+            raise FileNotFoundError("No usable SSH private key found and SSH_PASS not set.")
+        return keys
+
+    def _try_connect(self, host: str, username: str, pkey: Optional[paramiko.PKey], password: Optional[str], sock=None, timeout=30):
+        cli = paramiko.SSHClient()
+        cli.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        cli.connect(
+            hostname=host,
+            username=username,
+            pkey=pkey,
+            password=(password if pkey is None else None),
+            timeout=timeout,
+            banner_timeout=timeout,
+            auth_timeout=timeout,
+            allow_agent=False,
+            look_for_keys=False,
+            sock=sock
+        )
+        return cli
+
+    # def connect(self, address: str, port: int = 22,
+    #         bastion_server_address: str = None,
+    #         username: str = "ec2-user",
+    #         is_bastion_server: bool = False):
+    #     """Connect to cluster nodes"""
+    #     # --- prep usernames list ---
+    #     default_users = ["ec2-user", "ubuntu", "rocky", "root"]
+    #     if getattr(self, "ssh_user", None):
+    #         if isinstance(self.ssh_user, (list, tuple)):
+    #             usernames = list(self.ssh_user)
+    #         else:
+    #             usernames = [str(self.ssh_user)]
+    #     else:
+    #         usernames = default_users
+
+    #     # Load key (Ed25519 -> RSA fallback)
+    #     if not os.path.exists(SSH_KEY_LOCATION):
+    #         raise FileNotFoundError(f"SSH private key not found at {SSH_KEY_LOCATION}")
+    #     try:
+    #         private_key = paramiko.Ed25519Key(filename=SSH_KEY_LOCATION)
+    #     except Exception:
+    #         private_key = paramiko.RSAKey.from_private_key_file(SSH_KEY_LOCATION)
+
+    #     # Helper to store/replace a connection
+    #     def _store(host, client):
+    #         if self.ssh_connections.get(host):
+    #             try:
+    #                 self.ssh_connections[host].close()
+    #             except Exception:
+    #                 pass
+    #         self.ssh_connections[host] = client
+
+    #     # ---------- direct connection ----------
+    #     bastion_server_address = bastion_server_address or self.bastion_server
+    #     if not bastion_server_address:
+    #         self.logger.info(f"Connecting directly to {address} on port {port}...")
+    #         last_err = None
+    #         for user in usernames:
+    #             ssh = paramiko.SSHClient()
+    #             ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    #             try:
+    #                 ssh.connect(
+    #                     hostname=address,
+    #                     username=user,
+    #                     port=port,
+    #                     pkey=private_key,
+    #                     timeout=300,
+    #                     banner_timeout=30,
+    #                     auth_timeout=30,
+    #                     allow_agent=False,
+    #                     look_for_keys=False,
+    #                 )
+    #                 self.logger.info(f"Connected directly to {address} as '{user}'.")
+    #                 _store(address, ssh)
+    #                 return
+    #             except Exception as e:
+    #                 last_err = e
+    #                 self.logger.info(f"Direct login failed for '{user}': {repr(e)}")
+    #                 try:
+    #                     ssh.close()
+    #                 except Exception:
+    #                     pass
+    #         raise Exception(f"All usernames failed for {address}. Last error: {repr(last_err)}")
+
+    #     # ---------- connect to bastion ----------
+    #     self.logger.info(f"Connecting to bastion server {bastion_server_address}...")
+    #     bastion_ssh = paramiko.SSHClient()
+    #     bastion_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    #     last_err = None
+    #     bastion_user_used = None
+    #     for b_user in usernames:
+    #         try:
+    #             bastion_ssh.connect(
+    #                 hostname=bastion_server_address,
+    #                 username=b_user,
+    #                 port=port,
+    #                 pkey=private_key,
+    #                 timeout=300,
+    #                 banner_timeout=30,
+    #                 auth_timeout=30,
+    #                 allow_agent=False,
+    #                 look_for_keys=False,
+    #             )
+    #             self.logger.info(f"Connected to bastion as '{b_user}'.")
+    #             _store(bastion_server_address, bastion_ssh)
+    #             bastion_user_used = b_user
+    #             break
+    #         except Exception as e:
+    #             last_err = e
+    #             self.logger.info(f"Bastion login failed for '{b_user}': {repr(e)}")
+    #     if bastion_user_used is None:
+    #         raise Exception(f"All usernames failed for bastion {bastion_server_address}. Last error: {repr(last_err)}")
+    #     if is_bastion_server:
+    #         return  # caller only needed bastion
+
+    #     # ---------- tunnel to target through bastion ----------
+    #     self.logger.info(f"Connecting to target server {address} through bastion server...")
+    #     transport = bastion_ssh.get_transport()
+    #     last_err = None
+    #     for user in usernames:
+    #         # IMPORTANT: open a NEW channel for each username attempt
+    #         try:
+    #             channel = transport.open_channel(
+    #                 "direct-tcpip",
+    #                 (address, port),
+    #                 ("localhost", 0),
+    #             )
+    #         except paramiko.ssh_exception.ChannelException as ce:
+    #             self.logger.error(
+    #                 f"Channel open failed: {repr(ce)} — check AllowTcpForwarding/PermitOpen on bastion."
+    #             )
+    #             raise
+    #         target_ssh = paramiko.SSHClient()
+    #         target_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    #         try:
+    #             target_ssh.connect(
+    #                 address,
+    #                 username=user,
+    #                 port=port,
+    #                 sock=channel,
+    #                 pkey=private_key,
+    #                 timeout=300,
+    #                 banner_timeout=30,
+    #                 auth_timeout=30,
+    #                 allow_agent=False,
+    #                 look_for_keys=False,
+    #             )
+    #             self.logger.info(f"Connected to {address} as '{user}' via bastion '{bastion_user_used}'.")
+    #             _store(address, target_ssh)
+    #             return
+    #         except Exception as e:
+    #             last_err = e
+    #             self.logger.info(f"Target login failed for '{user}': {repr(e)}")
+    #             try:
+    #                 target_ssh.close()
+    #             except Exception:
+    #                 pass
+    #             try:
+    #                 channel.close()
+    #             except Exception:
+    #                 pass
+
+    #     raise Exception(
+    #         f"Tunnel established, but all usernames failed for target {address}. Last error: {repr(last_err)}"
+    #     )
 
     def connect(self, address: str, port: int = 22,
             bastion_server_address: str = None,
             username: str = "ec2-user",
             is_bastion_server: bool = False):
-        """Connect to cluster nodes"""
-        # --- prep usernames list ---
-        default_users = ["ec2-user", "ubuntu", "rocky", "root"]
-        if getattr(self, "ssh_user", None):
-            if isinstance(self.ssh_user, (list, tuple)):
-                usernames = list(self.ssh_user)
-            else:
-                usernames = [str(self.ssh_user)]
-        else:
-            usernames = default_users
+        """
+        Connect to a host directly or via bastion, trying multiple usernames and keys,
+        with optional password fallback.
+        """
+        # Resolve bastion
+        bastion_server_address = bastion_server_address or self.bastion_server
 
-        # Load key (Ed25519 -> RSA fallback)
-        if not os.path.exists(SSH_KEY_LOCATION):
-            raise FileNotFoundError(f"SSH private key not found at {SSH_KEY_LOCATION}")
-        try:
-            private_key = paramiko.Ed25519Key(filename=SSH_KEY_LOCATION)
-        except Exception:
-            private_key = paramiko.RSAKey.from_private_key_file(SSH_KEY_LOCATION)
+        usernames = self._candidate_usernames(self.ssh_user or username)
+        keys = self._load_private_keys()
+        password = self.ssh_pass
 
-        # Helper to store/replace a connection
         def _store(host, client):
             if self.ssh_connections.get(host):
                 try:
@@ -80,230 +280,291 @@ def _store(host, client):
                     pass
             self.ssh_connections[host] = client
 
-        # ---------- direct connection ----------
-        bastion_server_address = bastion_server_address or self.bastion_server
+        # --- NO BASTION: direct connect ---
         if not bastion_server_address:
-            self.logger.info(f"Connecting directly to {address} on port {port}...")
             last_err = None
+            self.logger.info(f"Connecting directly to {address} on port {port}...")
             for user in usernames:
-                ssh = paramiko.SSHClient()
-                ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-                try:
-                    ssh.connect(
-                        hostname=address,
-                        username=user,
-                        port=port,
-                        pkey=private_key,
-                        timeout=300,
-                        banner_timeout=30,
-                        auth_timeout=30,
-                        allow_agent=False,
-                        look_for_keys=False,
-                    )
-                    self.logger.info(f"Connected directly to {address} as '{user}'.")
-                    _store(address, ssh)
-                    return
-                except Exception as e:
-                    last_err = e
-                    self.logger.info(f"Direct login failed for '{user}': {repr(e)}")
+                # try keys
+                for key in keys:
                     try:
-                        ssh.close()
-                    except Exception:
-                        pass
+                        cli = self._try_connect(address, user, key, None, timeout=30)
+                        self.logger.info(f"Connected directly to {address} as '{user}'.")
+                        _store(address, cli)
+                        return
+                    except Exception as e:
+                        last_err = e
+                # then password
+                if password:
+                    try:
+                        cli = self._try_connect(address, user, None, password, timeout=30)
+                        self.logger.info(f"Connected directly to {address} as '{user}' (password).")
+                        _store(address, cli)
+                        return
+                    except Exception as e:
+                        last_err = e
             raise Exception(f"All usernames failed for {address}. Last error: {repr(last_err)}")
 
-        # ---------- connect to bastion ----------
-        self.logger.info(f"Connecting to bastion server {bastion_server_address}...")
-        bastion_ssh = paramiko.SSHClient()
-        bastion_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-        last_err = None
-        bastion_user_used = None
-        for b_user in usernames:
-            try:
-                bastion_ssh.connect(
-                    hostname=bastion_server_address,
-                    username=b_user,
-                    port=port,
-                    pkey=private_key,
-                    timeout=300,
-                    banner_timeout=30,
-                    auth_timeout=30,
-                    allow_agent=False,
-                    look_for_keys=False,
-                )
-                self.logger.info(f"Connected to bastion as '{b_user}'.")
-                _store(bastion_server_address, bastion_ssh)
-                bastion_user_used = b_user
+        # --- VIA BASTION ---
+        # ensure bastion client (reuse if alive)
+        if (not self._bastion_client) or (not self._bastion_client.get_transport()) or (not self._bastion_client.get_transport().is_active()):
+            last_err = None
+            self.logger.info(f"Connecting to bastion server {bastion_server_address}...")
+            for b_user in self._candidate_usernames(self.ssh_user or username):
+                for key in keys:
+                    try:
+                        cli = self._try_connect(bastion_server_address, b_user, key, None, timeout=30)
+                        self._bastion_client = cli
+                        self.logger.info(f"Connected to bastion as '{b_user}'.")
+                        break
+                    except Exception as e:
+                        last_err = e
+                else:
+                    if password:
+                        try:
+                            cli = self._try_connect(bastion_server_address, b_user, None, password, timeout=30)
+                            self._bastion_client = cli
+                            self.logger.info(f"Connected to bastion as '{b_user}' (password).")
+                            break
+                        except Exception as e:
+                            last_err = e
+                    continue
                 break
-            except Exception as e:
-                last_err = e
-                self.logger.info(f"Bastion login failed for '{b_user}': {repr(e)}")
-        if bastion_user_used is None:
-            raise Exception(f"All usernames failed for bastion {bastion_server_address}. Last error: {repr(last_err)}")
+            if (not self._bastion_client) or (not self._bastion_client.get_transport()) or (not self._bastion_client.get_transport().is_active()):
+                raise Exception(f"All usernames failed for bastion {bastion_server_address}. Last error: {repr(last_err)}")
+
         if is_bastion_server:
-            return  # caller only needed bastion
+            # caller only wanted bastion connection open
+            _store(bastion_server_address, self._bastion_client)
+            return
 
-        # ---------- tunnel to target through bastion ----------
+        # open a channel through bastion → target
         self.logger.info(f"Connecting to target server {address} through bastion server...")
-        transport = bastion_ssh.get_transport()
+        bastion_transport = self._bastion_client.get_transport()
+
         last_err = None
         for user in usernames:
-            # IMPORTANT: open a NEW channel for each username attempt
-            try:
-                channel = transport.open_channel(
-                    "direct-tcpip",
-                    (address, port),
-                    ("localhost", 0),
-                )
-            except paramiko.ssh_exception.ChannelException as ce:
-                self.logger.error(
-                    f"Channel open failed: {repr(ce)} — check AllowTcpForwarding/PermitOpen on bastion."
-                )
-                raise
-            target_ssh = paramiko.SSHClient()
-            target_ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
-            try:
-                target_ssh.connect(
-                    address,
-                    username=user,
-                    port=port,
-                    sock=channel,
-                    pkey=private_key,
-                    timeout=300,
-                    banner_timeout=30,
-                    auth_timeout=30,
-                    allow_agent=False,
-                    look_for_keys=False,
-                )
-                self.logger.info(f"Connected to {address} as '{user}' via bastion '{bastion_user_used}'.")
-                _store(address, target_ssh)
-                return
-            except Exception as e:
-                last_err = e
-                self.logger.info(f"Target login failed for '{user}': {repr(e)}")
+            # new channel for each attempt
+            chan = bastion_transport.open_channel("direct-tcpip", (address, port), ("127.0.0.1", 0))
+            # try keys
+            for key in keys:
                 try:
-                    target_ssh.close()
-                except Exception:
-                    pass
+                    cli = self._try_connect(address, user, key, None, sock=chan, timeout=30)
+                    self.logger.info(f"Connected to {address} as '{user}' via bastion.")
+                    _store(address, cli)
+                    return
+                except Exception as e:
+                    last_err = e
+            # then password
+            if password:
                 try:
-                    channel.close()
-                except Exception:
-                    pass
-
-        raise Exception(
-            f"Tunnel established, but all usernames failed for target {address}. Last error: {repr(last_err)}"
-        )
-
+                    cli = self._try_connect(address, user, None, password, sock=chan, timeout=30)
+                    self.logger.info(f"Connected to {address} as '{user}' via bastion (password).")
+                    _store(address, cli)
+                    return
+                except Exception as e:
+                    last_err = e
+            try:
+                chan.close()
+            except Exception:
+                pass
+
+        raise Exception(f"Tunnel established, but all usernames failed for target {address}. Last error: {repr(last_err)}")
+
+
+
+    # def exec_command(self, node, command, timeout=360, max_retries=3, stream_callback=None, supress_logs=False):
+    #     """Executes a command on a given machine with streaming output and retry mechanism.
+
+    #     Args:
+    #         node (str): Machine to run command on.
+    #         command (str): Command to run.
+    #         timeout (int): Timeout in seconds.
+    #         max_retries (int): Number of retries in case of failures.
+    #         stream_callback (callable, optional): A callback function for streaming output. Defaults to None.
+
+    #     Returns:
+    #         tuple: Final output and error strings after command execution.
+    #     """
+    #     retry_count = 0
+    #     while retry_count < max_retries:
+    #         with self.ssh_semaphore:
+    #             ssh_connection = self.ssh_connections.get(node)
+    #             try:
+    #                 # Ensure the SSH connection is active, otherwise reconnect
+    #                 if not ssh_connection or not ssh_connection.get_transport().is_active() or retry_count > 0:
+    #                     self.logger.info(f"Reconnecting SSH to node {node}")
+    #                     self.connect(
+    #                         address=node,
+    #                         is_bastion_server=True if node == self.bastion_server else False
+    #                     )
+    #                     ssh_connection = self.ssh_connections[node]
+                    
+    #                 if not supress_logs:
+    #                     self.logger.info(f"Executing command: {command}")
+    #                 stdin, stdout, stderr = ssh_connection.exec_command(command, timeout=timeout)
+
+    #                 output = []
+    #                 error = []
+
+    #                 # Read stdout and stderr dynamically if stream_callback is provided
+    #                 if stream_callback:
+    #                     while not stdout.channel.exit_status_ready():
+    #                         # Process stdout
+    #                         if stdout.channel.recv_ready():
+    #                             chunk = stdout.channel.recv(1024).decode()
+    #                             output.append(chunk)
+    #                             stream_callback(chunk, is_error=False)  # Callback for stdout
+
+    #                         # Process stderr
+    #                         if stderr.channel.recv_stderr_ready():
+    #                             chunk = stderr.channel.recv_stderr(1024).decode()
+    #                             error.append(chunk)
+    #                             stream_callback(chunk, is_error=True)  # Callback for stderr
+
+    #                         time.sleep(0.1)
+
+    #                     # Finalize any remaining output
+    #                     if stdout.channel.recv_ready():
+    #                         chunk = stdout.channel.recv(1024).decode()
+    #                         output.append(chunk)
+    #                         stream_callback(chunk, is_error=False)
+
+    #                     if stderr.channel.recv_stderr_ready():
+    #                         chunk = stderr.channel.recv_stderr(1024).decode()
+    #                         error.append(chunk)
+    #                         stream_callback(chunk, is_error=True)
+    #                 else:
+    #                     # Default behavior: Read the entire output at once
+    #                     output = stdout.read().decode()
+    #                     error = stderr.read().decode()
+
+    #                 # Combine the output into strings
+    #                 output = "".join(output) if isinstance(output, list) else output
+    #                 error = "".join(error) if isinstance(error, list) else error
+
+    #                 # Log the results
+    #                 if output:
+    #                     if not supress_logs:
+    #                         self.logger.info(f"Command output: {output}")
+    #                 if error:
+    #                     if not supress_logs:
+    #                         self.logger.error(f"Command error: {error}")
+
+    #                 if not output and not error:
+    #                     if not supress_logs:
+    #                         self.logger.warning(f"Command '{command}' executed but returned no output or error.")
+
+    #                 return output, error
+
+    #             except EOFError as e:
+    #                 self.logger.error(f"EOFError occurred while executing command '{command}': {e}. Retrying ({retry_count + 1}/{max_retries})...")
+    #                 retry_count += 1
+    #                 time.sleep(2)  # Short delay before retrying
+
+    #             except paramiko.SSHException as e:
+    #                 self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...")
+    #                 retry_count += 1
+    #                 time.sleep(2)  # Short delay before retrying
+
+    #             except paramiko.buffered_pipe.PipeTimeout as e:
+    #                 self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...")
+    #                 retry_count += 1
+    #                 time.sleep(2)  # Short delay before retrying
+
+    #             except Exception as e:
+    #                 self.logger.error(f"SSH command failed (General Exception): {e}. Retrying ({retry_count + 1}/{max_retries})...")
+    #                 retry_count += 1
+    #                 time.sleep(2)  # Short delay before retrying
+
+    #     # If we exhaust retries, return failure
+    #     self.logger.error(f"Failed to execute command '{command}' on node {node} after {max_retries} retries.")
+    #     return "", "Command failed after max retries"
 
     def exec_command(self, node, command, timeout=360, max_retries=3, stream_callback=None, supress_logs=False):
-        """Executes a command on a given machine with streaming output and retry mechanism.
-
-        Args:
-            node (str): Machine to run command on.
-            command (str): Command to run.
-            timeout (int): Timeout in seconds.
-            max_retries (int): Number of retries in case of failures.
-            stream_callback (callable, optional): A callback function for streaming output. Defaults to None.
-
-        Returns:
-            tuple: Final output and error strings after command execution.
         """
-        retry_count = 0
-        while retry_count < max_retries:
+        Execute a command with auto-reconnect (serialized per node), optional streaming,
+        and proper exit-status capture to reduce “ran but no output” confusion.
+        """
+        retry = 0
+        while retry < max_retries:
             with self.ssh_semaphore:
-                ssh_connection = self.ssh_connections.get(node)
+                # serialize reconnect attempts per node
+                lock = self._reconnect_locks[node]
+                with lock:
+                    ssh = self.ssh_connections.get(node)
+                    if not ssh or not ssh.get_transport() or not ssh.get_transport().is_active() or retry > 0:
+                        if not supress_logs:
+                            self.logger.info(f"Reconnecting SSH to node {node}")
+                        # if node is the bastion itself
+                        self.connect(node, is_bastion_server=(node == self.bastion_server))
+                        ssh = self.ssh_connections[node]
+
                 try:
-                    # Ensure the SSH connection is active, otherwise reconnect
-                    if not ssh_connection or not ssh_connection.get_transport().is_active() or retry_count > 0:
-                        self.logger.info(f"Reconnecting SSH to node {node}")
-                        self.connect(
-                            address=node,
-                            is_bastion_server=True if node == self.bastion_server else False
-                        )
-                        ssh_connection = self.ssh_connections[node]
-                    
                     if not supress_logs:
                         self.logger.info(f"Executing command: {command}")
-                    stdin, stdout, stderr = ssh_connection.exec_command(command, timeout=timeout)
+                    stdin, stdout, stderr = ssh.exec_command(command, timeout=timeout)
+                    output_chunks, error_chunks = [], []
 
-                    output = []
-                    error = []
-
-                    # Read stdout and stderr dynamically if stream_callback is provided
                     if stream_callback:
                         while not stdout.channel.exit_status_ready():
-                            # Process stdout
                             if stdout.channel.recv_ready():
-                                chunk = stdout.channel.recv(1024).decode()
-                                output.append(chunk)
-                                stream_callback(chunk, is_error=False)  # Callback for stdout
-
-                            # Process stderr
+                                chunk = stdout.channel.recv(8192).decode(errors="replace")
+                                output_chunks.append(chunk)
+                                stream_callback(chunk, is_error=False)
                             if stderr.channel.recv_stderr_ready():
-                                chunk = stderr.channel.recv_stderr(1024).decode()
-                                error.append(chunk)
-                                stream_callback(chunk, is_error=True)  # Callback for stderr
-
-                            time.sleep(0.1)
-
-                        # Finalize any remaining output
-                        if stdout.channel.recv_ready():
-                            chunk = stdout.channel.recv(1024).decode()
-                            output.append(chunk)
+                                chunk = stderr.channel.recv_stderr(8192).decode(errors="replace")
+                                error_chunks.append(chunk)
+                                stream_callback(chunk, is_error=True)
+                            time.sleep(0.05)
+
+                        # flush remaining
+                        while stdout.channel.recv_ready():
+                            chunk = stdout.channel.recv(8192).decode(errors="replace")
+                            output_chunks.append(chunk)
                             stream_callback(chunk, is_error=False)
-
-                        if stderr.channel.recv_stderr_ready():
-                            chunk = stderr.channel.recv_stderr(1024).decode()
-                            error.append(chunk)
+                        while stderr.channel.recv_stderr_ready():
+                            chunk = stderr.channel.recv_stderr(8192).decode(errors="replace")
+                            error_chunks.append(chunk)
                             stream_callback(chunk, is_error=True)
+
+                        exit_status = stdout.channel.recv_exit_status()
+                        out = "".join(output_chunks)
+                        err = "".join(error_chunks)
                     else:
-                        # Default behavior: Read the entire output at once
-                        output = stdout.read().decode()
-                        error = stderr.read().decode()
+                        out = stdout.read().decode(errors="replace")
+                        err = stderr.read().decode(errors="replace")
+                        exit_status = stdout.channel.recv_exit_status()
 
-                    # Combine the output into strings
-                    output = "".join(output) if isinstance(output, list) else output
-                    error = "".join(error) if isinstance(error, list) else error
+                    if (not supress_logs) and out:
+                        self.logger.info(f"Command output: {out.strip()[:2000]}")
+                    if (not supress_logs) and err:
+                        self.logger.error(f"Command error: {err.strip()[:2000]}")
 
-                    # Log the results
-                    if output:
-                        if not supress_logs:
-                            self.logger.info(f"Command output: {output}")
-                    if error:
-                        if not supress_logs:
-                            self.logger.error(f"Command error: {error}")
+                    if exit_status != 0 and not err:
+                        # some tools write nothing on stderr but non-zero exit
+                        err = f"Non-zero exit status: {exit_status}"
 
-                    if not output and not error:
+                    if not out and not err:
                         if not supress_logs:
                             self.logger.warning(f"Command '{command}' executed but returned no output or error.")
 
-                    return output, error
-
-                except EOFError as e:
-                    self.logger.error(f"EOFError occurred while executing command '{command}': {e}. Retrying ({retry_count + 1}/{max_retries})...")
-                    retry_count += 1
-                    time.sleep(2)  # Short delay before retrying
+                    return out, err
 
-                except paramiko.SSHException as e:
-                    self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...")
-                    retry_count += 1
-                    time.sleep(2)  # Short delay before retrying
-
-                except paramiko.buffered_pipe.PipeTimeout as e:
-                    self.logger.error(f"SSH command failed: {e}. Retrying ({retry_count + 1}/{max_retries})...")
-                    retry_count += 1
-                    time.sleep(2)  # Short delay before retrying
+                except (EOFError, paramiko.SSHException, paramiko.buffered_pipe.PipeTimeout, socket.error) as e:
+                    retry += 1
+                    self.logger.error(f"SSH command failed ({type(e).__name__}): {e}. Retrying ({retry}/{max_retries})...")
+                    time.sleep(min(2 * retry, 5))
 
                 except Exception as e:
-                    self.logger.error(f"SSH command failed (General Exception): {e}. Retrying ({retry_count + 1}/{max_retries})...")
-                    retry_count += 1
-                    time.sleep(2)  # Short delay before retrying
+                    retry += 1
+                    self.logger.error(f"SSH command failed (General): {e}. Retrying ({retry}/{max_retries})...")
+                    time.sleep(min(2 * retry, 5))
 
-        # If we exhaust retries, return failure
         self.logger.error(f"Failed to execute command '{command}' on node {node} after {max_retries} retries.")
         return "", "Command failed after max retries"
 
-    
+
     def format_disk(self, node, device, fs_type="ext4"):
         """Format disk on the given node
 
@@ -362,14 +623,133 @@ def get_devices(self, node):
 
         return output.strip().split()
     
-    def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwargs):
-        """Run FIO Tests with given params and proper logging for MD5 error timestamp tracing.
+    # def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwargs):
+    #     """
+    #     Run FIO with optional 'ensure_running' that verifies process presence and retries start  up to N times.
+
+    #     kwargs:
+    #     - ensure_running: bool (default False)
+    #     - max_start_retries: int (default 3)
+    #     """
+    #     location = ""
+    #     if device:
+    #         location = f"--filename={device}"
+    #     if directory:
+    #         location = f"--directory={directory}"
+
+    #     runtime     = kwargs.get("runtime", 3600)
+    #     name        = kwargs.get("name", f"fio_{_rid(6)}")
+    #     ioengine    = kwargs.get("ioengine", "libaio")
+    #     iodepth     = kwargs.get("iodepth", 1)
+    #     time_based  = "--time_based" if kwargs.get("time_based", True) else ""
+    #     rw          = kwargs.get("rw", "randrw")
+    #     bs          = kwargs.get("bs", "4K")
+    #     size        = kwargs.get("size", "1G")
+    #     rwmixread   = kwargs.get("rwmixread", 70)
+    #     numjobs     = kwargs.get("numjobs", 2)
+    #     nrfiles     = kwargs.get("nrfiles", 8)
+    #     log_avg_ms  = kwargs.get("log_avg_msec", 1000)
+    #     output_fmt  = f' --output-format={kwargs["output_format"]} ' if kwargs.get("output_format") else ''
+    #     output_file = f" --output={kwargs['output_file']} " if kwargs.get("output_file") else ''
+    #     iolog_base  = kwargs.get("iolog_file")
+
+    #     iolog_opt   = f"--write_iolog={iolog_base}" if iolog_base else ""
+    #     log_opt     = f"--log_avg_msec={log_avg_ms}" if log_avg_ms else ""
+
+    #     command = (
+    #         f"sudo fio --name={name} {location} --ioengine={ioengine} --direct=1 --iodepth={iodepth} "
+    #         f"{time_based} --runtime={runtime} --rw={rw} --max_latency=20s --bs={bs} --size={size} --rwmixread={rwmixread} "
+    #         f"--verify=md5 --verify_dump=1 --verify_fatal=1 --numjobs={numjobs} --nrfiles={nrfiles} "
+    #         f"{log_opt} {iolog_opt} {output_fmt}{output_file}"
+    #     )
+    #     if kwargs.get("debug"):
+    #         command += " --debug=all"
+    #     if log_file:
+    #         command += f" > {log_file} 2>&1"
+
+    #     ensure_running   = bool(kwargs.get("ensure_running", False))
+    #     max_start_retries = int(kwargs.get("max_start_retries", 3))
+
+    #     launch_retries = 3
+    #     for attempt in range(1, launch_retries + 1):
+
+    #         try:
+    #             self.logger.info(f"Starting FIO on {node}: {name} → {location} (attempt {attempt}/{launch_retries})")
+    #             self.exec_command(node=node, command=f"sudo {command}", max_retries=2)
+    #             break
+    #         except Exception as e:
+    #             self.logger.error(f"FIO start failed: {e}")
+    #             if attempt == launch_retries:
+    #                 raise
+    #             time.sleep(1.0 * attempt)
+
+    #     # Ensure process is up (pgrep name)
+    #     start_retries = 6
+    #     for i in range(start_retries):
+    #         out, err  = self.exec_command(
+    #             node=node,
+    #             command=f"pgrep -fa 'fio.*{name}' || true",
+    #             max_retries=1,
+    #         )
+    #         if out.strip():
+    #             self.logger.info(f"FIO is running for {name}: {out.strip().splitlines()[0]}")
+    #             return
+    #         # Not running yet → small backoff and try again
+    #         time.sleep(2 + i)
+    #         # If still not, try re-launch quickly
+    #         if i >= 2:
+    #             self.logger.warning(f"FIO still not running for {name}; re-issuing start (try {i-1}/{start_retries-3})")
+    #             try:
+    #                 self.exec_command(node=node, command=f"sudo {command}", max_retries=1)
+    #             except Exception as e:
+    #                 self.logger.warning(f"Re-start attempt raised: {e}")
+
+    #     # If we get here, fio didn’t stick
+    #     raise RuntimeError(f"FIO failed to stay running for job {name} on {node}")
+
+        # def _is_running():
+        #     # Use pgrep on job name (fio --name=<name>) for a quick check
+        #     # Fall back to ps+grep if pgrep not present.
+        #     try:
+        #         out, _ = self.exec_command(node=node, command=f"pgrep -fl 'fio.*--name={name}'", max_retries=1)
+        #         return bool(out.strip())
+        #     except Exception:
+        #         out, _ = self.exec_command(node=node, command=f"ps ax | grep -E 'fio.*--name={name}' | grep -v grep || true", max_retries=1)
+        #         return bool(out.strip())
+
+        # # Try to start; handle EOF/channel close by reconnect+retry
+        # attempts = 0
+        # while True:
+        #     attempts += 1
+        #     try:
+        #         self.exec_command(node=node, command=command, max_retries=3)
+        #     except Exception as e:
+        #         # Channel/EOF during start is common in churn; retry a few times
+        #         if attempts < max_start_retries:
+        #             self.logger.error(f"FIO start error ({e}); retrying {attempts}/{max_start_retries} in 2s")
+        #             time.sleep(2)
+        #             continue
+        #         else:
+        #             raise
+
+        #     if not ensure_running:
+        #         return
+
+        #     # Verify started; retry if not
+        #     time.sleep(1.0)
+        #     if _is_running():
+        #         return
+
+        #     if attempts >= max_start_retries:
+        #         raise RuntimeError(f"FIO failed to start after {max_start_retries} attempts for job '{name}'")
+
+        #     self.logger.warning(f"FIO not detected running for '{name}'; retrying start {attempts}/{max_start_retries}")
+        #     time.sleep(1.0)
 
-        Args:
-            node (str): Node to perform ssh operation on
-            device (str): Device path. Defaults to None.
-            directory (str, optional): Directory to run test on. Defaults to None.
-            log_file (str, optional): Log file to redirect output to. Defaults to None.
+    def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwargs):
+        """
+        Start FIO in a detached tmux session so it survives SSH channel drops during fast outages.
+        Verifies process presence and re-kicks a few times if missing.
         """
         location = ""
         if device:
@@ -377,72 +757,63 @@ def run_fio_test(self, node, device=None, directory=None, log_file=None, **kwarg
         if directory:
             location = f"--directory={directory}"
 
-        runtime = kwargs.get("runtime", 3600)
-        rw = kwargs.get("rw", "randrw")
-        name = kwargs.get("name", "test")
-        ioengine = kwargs.get("ioengine", "libaio")
-        iodepth = kwargs.get("iodepth", 1)
-        bs = kwargs.get("bs", "4k")
-        rwmixread = kwargs.get("rwmixread", 70)
-        size = kwargs.get("size", "10MiB")
-        time_based = "--time_based" if kwargs.get("time_based", True) else ""
-        numjobs = kwargs.get("numjobs", 1)
-        nrfiles = kwargs.get("nrfiles", 1)
-
-        output_format = f' --output-format={kwargs["output_format"]} ' if kwargs.get("output_format") else ''
+        runtime     = kwargs.get("runtime", 3600)
+        name        = kwargs.get("name", f"fio_{_rid(6)}")
+        ioengine    = kwargs.get("ioengine", "libaio")
+        iodepth     = kwargs.get("iodepth", 1)
+        time_based  = "--time_based" if kwargs.get("time_based", True) else ""
+        rw          = kwargs.get("rw", "randrw")
+        bs          = kwargs.get("bs", "4K")
+        size        = kwargs.get("size", "1G")
+        rwmixread   = kwargs.get("rwmixread", 70)
+        numjobs     = kwargs.get("numjobs", 2)
+        nrfiles     = kwargs.get("nrfiles", 8)
+        log_avg_ms  = kwargs.get("log_avg_msec", 1000)
+        max_latency  = kwargs.get("max_latency", "20s")
+        use_latency = kwargs.get("use_latency", True)
+        output_fmt  = f' --output-format={kwargs["output_format"]} ' if kwargs.get("output_format") else ''
         output_file = f" --output={kwargs['output_file']} " if kwargs.get("output_file") else ''
+        iolog_base  = kwargs.get("iolog_file")
 
-        log_avg_msec = kwargs.get("log_avg_msec", 1000)
-        log_avg_msec_opt = f"--log_avg_msec={log_avg_msec}" if log_avg_msec else ""
-
-        iolog_base = kwargs.get("iolog_file", None)
-        iolog_opt = f"--write_iolog={iolog_base}" if iolog_base else ""
-        verify_md5 = "--verify=md5" if iodepth == 1 else ""
+        iolog_opt   = f"--write_iolog={iolog_base}" if iolog_base else ""
+        log_opt     = f"--log_avg_msec={log_avg_ms}" if log_avg_ms else ""
+        latency = f" --max_latency={max_latency}" if use_latency else ""
 
-        command = (
-            f"sudo fio --name={name} {location} --ioengine={ioengine} --direct=1 --iodepth={iodepth} "
-            f"{time_based} --runtime={runtime} --rw={rw} --max_latency=30s --bs={bs} --size={size} --rwmixread={rwmixread} "
-            f"{verify_md5} --verify_dump=1 --verify_fatal=1 --numjobs={numjobs} --nrfiles={nrfiles} "
-            f"{log_avg_msec_opt} {iolog_opt} "
-            f"{output_format}{output_file}"
-        )
-        # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        # log_file = log_file or f"/tmp/{name}_{timestamp}.log"
+        # raw fio command
+        fio_cmd = (
+            f"fio --name={name} {location} --ioengine={ioengine} --direct=1 --iodepth={iodepth} "
+            f"{time_based} --runtime={runtime} --rw={rw} {latency} --bs={bs} --size={size} --rwmixread={rwmixread} "
+            f"--verify=md5 --verify_dump=1 --verify_fatal=1 --numjobs={numjobs} --nrfiles={nrfiles} "
+            f"{log_opt} {iolog_opt} {output_fmt}{output_file}"
+        ).strip()
 
         if kwargs.get("debug"):
-            command += " --debug=all"
+            fio_cmd += " --debug=all"
 
+        # run fio under tmux so HUP/SSH channel drops don't kill it
+        session = f"fio_{name}"
         if log_file:
-            command += f" > {log_file} 2>&1"
-        
-        # else:
-        #     command += " --debug=verify"
-        
-        # awk_ts = " | awk '{ print strftime(\"[%Y-%m-%d %H:%M:%S]\"), $0; fflush(); }' | "
-        # command += awk_ts
-        # command += f"tee {log_file}"
-
-        self.logger.info(f"Executing FIO command:\n{command}")
+            fio_cmd = f"{fio_cmd} > {log_file} 2>&1"
+
+        start_cmd = f"sudo tmux new-session -d -s {session} \"{fio_cmd}\" || sudo tmux kill-session -t {session} 2>/dev/null || true; sudo tmux new-session -d -s {session} \"{fio_cmd}\""
+        self.logger.info(f"Starting FIO on {node}: {name} in tmux session '{session}'")
+        self.exec_command(node=node, command=start_cmd, max_retries=2)
+
+        # Ensure process is up: check tmux & pgrep
+        for i in range(8):
+            out, _ = self.exec_command(node=node, command=f"pgrep -fa 'fio.*{name}' || true", max_retries=1, supress_logs=True)
+            tmux_ok, _ = self.exec_command(node=node, command=f"sudo tmux has-session -t {session} 2>/dev/null || echo MISSING", max_retries=1, supress_logs=True)
+            if out.strip() and "MISSING" not in tmux_ok:
+                self.logger.info(f"FIO is running for {name}: {out.strip().splitlines()[0]}")
+                return
+            if i >= 2:
+                self.logger.warning(f"FIO not detected yet for {name}; re-issuing start (try {i-1}/5)")
+                self.exec_command(node=node, command=start_cmd, max_retries=1, supress_logs=True)
+            time.sleep(2 + i)
 
-        start_time = time.time()
-        output, error = self.exec_command(node=node, command=command, timeout=runtime * 2)
-        end_time = time.time()
-
-        total_time = end_time - start_time
-        self.fio_runtime[name] = start_time
-        self.logger.info(f"Total time taken to run the command: {total_time:.2f} seconds")
-
-        # Return all generated iolog files (one per job)
-        iolog_files = [f"{iolog_base}.{i}" for i in range(numjobs)]
-        return {
-            "output": output,
-            "error": error,
-            "start_time": start_time,
-            "end_time": end_time,
-            "iolog_files": iolog_files,
-        }
+        raise RuntimeError(f"FIO failed to stay running for job {name} on {node}")
 
-    
+        
     def find_process_name(self, node, process_name, return_pid=False):
         if return_pid:
             command = "ps -ef | grep -i '%s' | awk '{print $2}'" % process_name
@@ -700,15 +1071,35 @@ def get_lvol_id(self, node, lvol_name):
         return output.strip().split()
     
     def get_snapshot_id(self, node, snapshot_name):
-        cmd = "%s snapshot list | grep -i '%s ' | awk '{print $2}'" % (self.base_cmd, snapshot_name)
-        output, error = self.exec_command(node=node, command=cmd)
+        start = time.time()
+        deadline = start + 600  # 10 minutes
+        wait_interval = 10       # seconds between checks
+        snapshot_id = ""
+
+        while time.time() < deadline:
+            cmd = "%s snapshot list | grep -i '%s ' | awk '{print $2}'" % (self.base_cmd, snapshot_name)
+            output, error = self.exec_command(node=node, command=cmd)
+            if output.strip():
+                if hasattr(self, "logger"):
+                    self.logger.info(f"Snapshot '{snapshot_name}' is visible with ID: {snapshot_id}")
+                break
+            time.sleep(wait_interval)
+
+        if not output.strip():
+            if hasattr(self, "logger"):
+                self.logger.error(f"Timed out waiting for snapshot '{snapshot_name}' to appear within 10 minutes.")
 
         return output.strip()
 
     def add_snapshot(self, node, lvol_id, snapshot_name):
         cmd = f"{self.base_cmd} -d snapshot add {lvol_id} {snapshot_name}"
         output, error = self.exec_command(node=node, command=cmd)
-        return output, error
+
+        snapshot_id = self.get_snapshot_id(node=node, snapshot_name=snapshot_name)
+
+        if not snapshot_id:
+            if hasattr(self, "logger"):
+                self.logger.error(f"Timed out waiting for snapshot '{snapshot_name}' to appear within 10 minutes.")
     
     def add_clone(self, node, snapshot_id, clone_name):
         cmd = f"{self.base_cmd} -d snapshot clone {snapshot_id} {clone_name}"
@@ -971,30 +1362,81 @@ def get_active_interfaces(self, node_ip):
             return []
         
 
-    def disconnect_all_active_interfaces(self, node_ip, interfaces, reconnect_time=300):
-        """
-        Disconnect all active network interfaces on a node in a single SSH call.
+    # def disconnect_all_active_interfaces(self, node_ip, interfaces, reconnect_time=300):
+    #     """
+    #     Disconnect all active network interfaces on a node in a single SSH call.
+
+    #     Args:
+    #         node_ip (str): IP of the target node.
+    #         interfaces (list): List of active network interfaces to disconnect.
+    #     """
+    #     if not interfaces:
+    #         self.logger.warning(f"No active interfaces to disconnect on node {node_ip}.")
+    #         return
+
+    #     # Combine disconnect commands for all interfaces
+    #     disconnect_cmds = " && ".join([f"sudo nmcli connection down {iface}" for iface in interfaces])
+    #     reconnect_cmds = " && ".join([f"sudo nmcli connection up {iface}" for iface in interfaces])
+
+    #     cmd = (
+    #         f'nohup sh -c "{disconnect_cmds} && sleep {reconnect_time} && {reconnect_cmds}" &'
+    #     )
+    #     self.logger.info(f"Executing combined disconnect command on node {node_ip}: {cmd}")
+    #     try:
+    #         self.exec_command(node_ip, cmd)
+    #     except Exception as e:
+    #         self.logger.error(f"Failed to execute combined disconnect command on {node_ip}: {e}")
+
+    def _ping_once(self, ip: str, count: int = 1, wait: int = 1) -> bool:
+        try:
+            # Use system ping; True means "ping success"
+            res = subprocess.run(["ping", "-c", str(count), "-W", str(wait), ip],
+                                 stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            return res.returncode == 0
+        except Exception:
+            return False
 
-        Args:
-            node_ip (str): IP of the target node.
-            interfaces (list): List of active network interfaces to disconnect.
+    def disconnect_all_active_interfaces(
+        self,
+        node_ip: str,
+        interfaces: list[str],
+        duration_secs: int = 300,
+        max_tries: int = 3,
+    ):
+        """
+        Bring all given interfaces DOWN, verify outage by ping, keep for duration, then bring them UP.
+        Fire-and-forget style; robust against brief SSH flaps.
         """
         if not interfaces:
-            self.logger.warning(f"No active interfaces to disconnect on node {node_ip}.")
+            self.logger.info(f"No active interfaces provided for {node_ip}; skipping NIC down.")
             return
 
-        # Combine disconnect commands for all interfaces
-        disconnect_cmds = " && ".join([f"sudo nmcli connection down {iface}" for iface in interfaces])
-        reconnect_cmds = " && ".join([f"sudo nmcli connection up {iface}" for iface in interfaces])
+        down_cmd = " && ".join([f"nmcli connection down {i}" for i in interfaces])
+        up_cmd   = " && ".join([f"nmcli connection up {i}" for i in interfaces])
+        cmd = f'nohup sh -c "{down_cmd} && sleep {duration_secs} && {up_cmd}" &'
 
-        cmd = (
-            f'nohup sh -c "{disconnect_cmds} && sleep {reconnect_time} && {reconnect_cmds}" &'
-        )
-        self.logger.info(f"Executing combined disconnect command on node {node_ip}: {cmd}")
         try:
-            self.exec_command(node_ip, cmd)
+            self.logger.info(f"Executing combined disconnect command on node {node_ip}: {cmd}")
+            out, err = self.exec_command(node=node_ip, command=cmd, max_retries=1, timeout=20)
+            if err:
+                raise Exception(err)
         except Exception as e:
-            self.logger.error(f"Failed to execute combined disconnect command on {node_ip}: {e}")
+            self.logger.info(f"Command: {cmd}, error: {e}! Checking pings!!")
+
+        # Verify outage begins (best-effort). If ping still works, attempt to issue 'down' again.
+        time.sleep(5)
+        tries = 0
+        attempts = 10
+        while self._ping_once(node_ip) and attempts > 0:
+            tries += 1
+            if tries >= max_tries:
+                self.logger.warning(f"Ping to {node_ip} still responding after NIC down attempts; continuing anyway.")
+                break
+            self.logger.info(f"Ping to {node_ip} still alive; retrying NIC down...")
+            # re-run only the DOWN part (don’t append sleep again to avoid stacking)
+            self.exec_command(node=node_ip, command=cmd, max_retries=2)
+            time.sleep(3)
+            attempts -= 1
 
     def check_tmux_installed(self, node_ip):
         """Check tmux installation
@@ -1420,132 +1862,263 @@ def dump_lvstore(self, node_ip, storage_node_id):
             self.logger.error(f"Failed to dump lvstore on {node_ip}: {e}")
             return None
         
-    def fetch_distrib_logs(self, storage_node_ip, storage_node_id, logs_path):
-        """
-        Fetch distrib names using bdev_get_bdevs RPC, generate and execute RPC JSON,
-        and copy logs from SPDK container.
+    # def fetch_distrib_logs(self, storage_node_ip, storage_node_id, logs_path):
+    #     """
+    #     Fetch distrib names using bdev_get_bdevs RPC, generate and execute RPC JSON,
+    #     and copy logs from SPDK container.
+
+    #     Args:
+    #         storage_node_ip (str): IP of the storage node
+    #         storage_node_id (str): ID of the storage node
+    #     """
+    #     self.logger.info(f"Fetching distrib logs for Storage Node ID: {storage_node_id} on {storage_node_ip}")
+
+    #     # Step 1: Find the SPDK container
+    #     find_container_cmd = "sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$'"
+    #     container_name_output, _ = self.exec_command(storage_node_ip, find_container_cmd)
+    #     container_name = container_name_output.strip()
+
+    #     if not container_name:
+    #         self.logger.warning(f"No SPDK container found on {storage_node_ip}")
+    #         return
+
+    #     # Step 2: Get bdev_get_bdevs output
+    #     # bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py bdev_get_bdevs'"
+    #     # bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd)
+
+    #     # if error:
+    #     #     self.logger.error(f"Error running bdev_get_bdevs: {error}")
+    #     #     return
+
+    #     # # Step 3: Save full output to local file
+    #     # timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S")
+    #     # raw_output_path = f"{Path.home()}/bdev_output_{storage_node_ip}_{timestamp}.json"
+    #     # with open(raw_output_path, "w") as f:
+    #     #     f.write(bdev_output)
+    #     # self.logger.info(f"Saved raw bdev_get_bdevs output to {raw_output_path}")
+
+    #     timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S")
+    #     base_path = f"{logs_path}/{storage_node_ip}/distrib_bdev_logs/"
+
+    #     cmd = f"sudo mkdir -p '{base_path}'"
+    #     self.exec_command(storage_node_ip, cmd)
+
+    #     remote_output_path = f"bdev_output_{storage_node_ip}_{timestamp}.json"
+
+    #     # 1. Run to capture output into a variable (for parsing)
+    #     bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py -s /mnt/ramdisk/{container_name}/spdk.sock bdev_get_bdevs'"
+    #     bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd)
+
+    #     if error:
+    #         self.logger.error(f"Error running bdev_get_bdevs: {error}")
+    #         return
+
+    #     # 2. Run again to save output on host machine (audit trail)
+    #     bdev_save_cmd = (
+    #         f"sudo bash -c \"docker exec {container_name} python spdk/scripts/rpc.py -s /mnt/ramdisk/{container_name}/spdk.sock bdev_get_bdevs > {remote_output_path}\"")
+
+    #     self.exec_command(storage_node_ip, bdev_save_cmd)
+    #     self.logger.info(f"Saved bdev_get_bdevs output to {remote_output_path} on {storage_node_ip}")
+
+
+    #     # Step 4: Extract unique distrib names
+    #     try:
+    #         bdevs = json.loads(bdev_output)
+    #         distribs = list({bdev['name'] for bdev in bdevs if bdev['name'].startswith('distrib_')})
+    #     except json.JSONDecodeError as e:
+    #         self.logger.error(f"JSON parsing failed: {e}")
+    #         return
+
+    #     if not distribs:
+    #         self.logger.warning("No distrib names found in bdev_get_bdevs output.")
+    #         return
+
+    #     self.logger.info(f"Distributions found: {distribs}")
+
+    #     # Step 5: Process each distrib
+    #     for distrib in distribs:
+    #         self.logger.info(f"Processing distrib: {distrib}")
+    #         rpc_json = {
+    #             "subsystems": [
+    #                 {
+    #                     "subsystem": "distr",
+    #                     "config": [
+    #                         {
+    #                             "method": "distr_debug_placement_map_dump",
+    #                             "params": {"name": distrib}
+    #                         }
+    #                     ]
+    #                 }
+    #             ]
+    #         }
+
+    #         rpc_json_str = json.dumps(rpc_json)
+    #         remote_json_path = "/tmp/stack.json"
+
+    #         # Save JSON file remotely
+    #         create_json_command = f"echo '{rpc_json_str}' | sudo tee {remote_json_path}"
+    #         self.exec_command(storage_node_ip, create_json_command)
+
+    #         # Copy into container
+    #         copy_json_command = f"sudo docker cp {remote_json_path} {container_name}:{remote_json_path}"
+    #         self.exec_command(storage_node_ip, copy_json_command)
+
+    #         # Run RPC inside container
+    #         rpc_command = f"sudo docker exec {container_name} bash -c 'python scripts/rpc_sock.py {remote_json_path} /mnt/ramdisk/{container_name}/spdk.sock'"
+    #         self.exec_command(storage_node_ip, rpc_command)
+
+    #         # Find and copy log
+    #         find_log_command = f"sudo docker exec {container_name} ls /tmp/ | grep {distrib}"
+    #         log_file_name, _ = self.exec_command(storage_node_ip, find_log_command)
+    #         log_file_name = log_file_name.strip().replace("\r", "").replace("\n", "")
+
+    #         if not log_file_name:
+    #             self.logger.error(f"No log file found for distrib {distrib}.")
+    #             continue
+
+    #         log_file_path = f"/tmp/{log_file_name}"
+    #         local_log_path = f"{base_path}/{log_file_name}_{storage_node_ip}_{timestamp}"
+    #         copy_log_cmd = f"sudo docker cp {container_name}:{log_file_path} {local_log_path}"
+    #         self.exec_command(storage_node_ip, copy_log_cmd)
+
+    #         self.logger.info(f"Fetched log for {distrib}: {local_log_path}")
+
+    #         # Clean up
+    #         delete_log_cmd = f"sudo docker exec {container_name} rm -f {log_file_path}"
+    #         self.exec_command(storage_node_ip, delete_log_cmd)
+
+    #     self.logger.info("All distrib logs retrieved successfully.")
 
-        Args:
-            storage_node_ip (str): IP of the storage node
-            storage_node_id (str): ID of the storage node
-        """
+    def fetch_distrib_logs(self, storage_node_ip, storage_node_id, logs_path):
         self.logger.info(f"Fetching distrib logs for Storage Node ID: {storage_node_id} on {storage_node_ip}")
 
-        # Step 1: Find the SPDK container
-        find_container_cmd = "sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$'"
-        container_name_output, _ = self.exec_command(storage_node_ip, find_container_cmd)
-        container_name = container_name_output.strip()
-
+        # 0) Find SPDK container name
+        find_container_cmd = "sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$' || true"
+        container_name_out, _ = self.exec_command(storage_node_ip, find_container_cmd)
+        container_name = (container_name_out or "").strip()
         if not container_name:
             self.logger.warning(f"No SPDK container found on {storage_node_ip}")
             return
 
-        # Step 2: Get bdev_get_bdevs output
-        # bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py bdev_get_bdevs'"
-        # bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd)
-
-        # if error:
-        #     self.logger.error(f"Error running bdev_get_bdevs: {error}")
-        #     return
-
-        # # Step 3: Save full output to local file
-        # timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S")
-        # raw_output_path = f"{Path.home()}/bdev_output_{storage_node_ip}_{timestamp}.json"
-        # with open(raw_output_path, "w") as f:
-        #     f.write(bdev_output)
-        # self.logger.info(f"Saved raw bdev_get_bdevs output to {raw_output_path}")
-
-        timestamp = datetime.now().strftime("%d-%m-%y-%H-%M-%S")
-        base_path = f"{logs_path}/{storage_node_ip}/distrib_bdev_logs/"
-
-        cmd = f"sudo mkdir -p '{base_path}'"
-        self.exec_command(storage_node_ip, cmd)
-
-        remote_output_path = f"bdev_output_{storage_node_ip}_{timestamp}.json"
-
-        # 1. Run to capture output into a variable (for parsing)
-        bdev_cmd = f"sudo docker exec {container_name} bash -c 'python spdk/scripts/rpc.py bdev_get_bdevs'"
-        bdev_output, error = self.exec_command(storage_node_ip, bdev_cmd)
-
-        if error:
-            self.logger.error(f"Error running bdev_get_bdevs: {error}")
+        # 1) Get bdevs via correct sock
+        timestamp = datetime.now().strftime("%Y%m%d_%H-%M-%S")
+        base_path = f"{logs_path}/{storage_node_ip}/distrib_bdev_logs"
+        self.exec_command(storage_node_ip, f"sudo mkdir -p '{base_path}' && sudo chmod -R 777 '{base_path}'")
+        bdev_cmd = (
+            f"sudo docker exec {container_name} bash -lc "
+            f"\"python spdk/scripts/rpc.py -s /mnt/ramdisk/{container_name}/spdk.sock bdev_get_bdevs\""
+        )
+        bdev_output, bdev_err = self.exec_command(storage_node_ip, bdev_cmd)
+        if (bdev_err and bdev_err.strip()) and not bdev_output:
+            self.logger.error(f"bdev_get_bdevs error on {storage_node_ip}: {bdev_err.strip()}")
             return
 
-        # 2. Run again to save output on host machine (audit trail)
-        bdev_save_cmd = (
-            f"sudo bash -c \"docker exec {container_name} python spdk/scripts/rpc.py bdev_get_bdevs > {remote_output_path}\"")
-
-        self.exec_command(storage_node_ip, bdev_save_cmd)
-        self.logger.info(f"Saved bdev_get_bdevs output to {remote_output_path} on {storage_node_ip}")
-
-
-        # Step 4: Extract unique distrib names
+        # Parse distrib names
         try:
             bdevs = json.loads(bdev_output)
-            distribs = list({bdev['name'] for bdev in bdevs if bdev['name'].startswith('distrib_')})
+            distribs = sorted({
+                b.get("name", "")
+                for b in bdevs
+                if isinstance(b, dict) and str(b.get("name","")).startswith("distrib_")
+            })
         except json.JSONDecodeError as e:
-            self.logger.error(f"JSON parsing failed: {e}")
+            self.logger.error(f"JSON parsing failed on {storage_node_ip}: {e}")
             return
-
         if not distribs:
-            self.logger.warning("No distrib names found in bdev_get_bdevs output.")
+            self.logger.warning(f"No distrib_* bdevs found on {storage_node_ip}.")
+            return
+        self.logger.info(f"[{storage_node_ip}] Distributions: {distribs}")
+
+        # 2) Run multiple docker exec in parallel from ONE SSH exec
+        distrib_list_str = " ".join(shlex.quote(d) for d in distribs)
+        remote_tar = f"/tmp/distrib_logs_{timestamp}.tar.gz"
+
+        # IMPORTANT: This script runs on the HOST and spawns many `docker exec ... &` in parallel.
+        # It throttles with MAXJ, waits, then tars outputs from /tmp inside the container into one tarball on the host.
+        remote_script = f"""\
+set -euo pipefail
+CN={shlex.quote(container_name)}
+SOCK="/mnt/ramdisk/$CN/spdk.sock"
+TS="{timestamp}"
+MAXJ=8
+WORKDIR_HOST="{base_path}"
+mkdir -p "$WORKDIR_HOST"
+
+# Make a temporary host folder to collect per-distrib files copied out of the container
+HOST_STAGING="/tmp/distrib_host_collect_$TS"
+mkdir -p "$HOST_STAGING"
+
+pids=()
+
+for d in {distrib_list_str}; do
+  (
+    # Build JSON on host then copy into container (avoids many ssh execs)
+    JF="/tmp/stack_${{d}}.json"
+    cat > "$JF" <<'EOF_JSON'
+{{
+  "subsystems": [
+    {{
+      "subsystem": "distr",
+      "config": [
+        {{
+          "method": "distr_debug_placement_map_dump",
+          "params": {{"name": "__DIST__"}}
+        }}
+      ]
+    }}
+  ]
+}}
+EOF_JSON
+    # substitute distrib name
+    sed -i "s/__DIST__/$d/g" "$JF"
+
+    # Copy JSON into container
+    sudo docker cp "$JF" "$CN:/tmp/stack_${{d}}.json"
+
+    # Run rpc inside container (socket path respected)
+    sudo docker exec "$CN" bash -lc "python scripts/rpc_sock.py /tmp/stack_${{d}}.json {shlex.quote('/mnt/ramdisk/'+container_name+'/spdk.sock')} > /tmp/rpc_${{d}}.log 2>&1 || true"
+
+    # Copy any files for this distrib out to host staging (rpc log + any matching /tmp/*d*)
+    sudo docker cp "$CN:/tmp/rpc_${{d}}.log" "$HOST_STAGING/rpc_${{d}}.log" 2>/dev/null || true
+    # try to pull any distrib-related artifacts
+    for f in $(sudo docker exec "$CN" bash -lc "ls /tmp/ 2>/dev/null | grep -F \"$d\" || true"); do
+      sudo docker cp "$CN:/tmp/$f" "$HOST_STAGING/$f" 2>/dev/null || true
+    done
+
+    # cleanup container temp for this distrib
+    sudo docker exec "$CN" bash -lc "rm -f /tmp/stack_${{d}}.json /tmp/rpc_${{d}}.log" || true
+    rm -f "$JF" || true
+  ) &
+
+  # throttle parallel jobs
+  while [ "$(jobs -rp | wc -l)" -ge "$MAXJ" ]; do sleep 0.2; done
+done
+
+# Wait for all background jobs
+wait
+
+# Tar once on host
+tar -C "$HOST_STAGING" -czf {shlex.quote(remote_tar)} . 2>/dev/null || true
+
+# Move artifacts to final location
+mv -f {shlex.quote(remote_tar)} "$WORKDIR_HOST/" || true
+
+# Also copy loose files (for convenience) then clean staging
+cp -rf "$HOST_STAGING"/. "$WORKDIR_HOST"/ 2>/dev/null || true
+rm -rf "$HOST_STAGING" || true
+
+echo "$WORKDIR_HOST/{os.path.basename(remote_tar)}"
+"""
+
+        run_many_cmd = "bash -lc " + shlex.quote(remote_script)
+        tar_out, tar_err = self.exec_command(storage_node_ip, run_many_cmd)
+        if (tar_err and tar_err.strip()) and not tar_out:
+            self.logger.error(f"[{storage_node_ip}] Parallel docker-exec script error: {tar_err.strip()}")
             return
 
-        self.logger.info(f"Distributions found: {distribs}")
-
-        # Step 5: Process each distrib
-        for distrib in distribs:
-            self.logger.info(f"Processing distrib: {distrib}")
-            rpc_json = {
-                "subsystems": [
-                    {
-                        "subsystem": "distr",
-                        "config": [
-                            {
-                                "method": "distr_debug_placement_map_dump",
-                                "params": {"name": distrib}
-                            }
-                        ]
-                    }
-                ]
-            }
-
-            rpc_json_str = json.dumps(rpc_json)
-            remote_json_path = "/tmp/stack.json"
-
-            # Save JSON file remotely
-            create_json_command = f"echo '{rpc_json_str}' | sudo tee {remote_json_path}"
-            self.exec_command(storage_node_ip, create_json_command)
-
-            # Copy into container
-            copy_json_command = f"sudo docker cp {remote_json_path} {container_name}:{remote_json_path}"
-            self.exec_command(storage_node_ip, copy_json_command)
-
-            # Run RPC inside container
-            rpc_command = f"sudo docker exec {container_name} bash -c 'python scripts/rpc_sock.py {remote_json_path}'"
-            self.exec_command(storage_node_ip, rpc_command)
-
-            # Find and copy log
-            find_log_command = f"sudo docker exec {container_name} ls /tmp/ | grep {distrib}"
-            log_file_name, _ = self.exec_command(storage_node_ip, find_log_command)
-            log_file_name = log_file_name.strip().replace("\r", "").replace("\n", "")
-
-            if not log_file_name:
-                self.logger.error(f"No log file found for distrib {distrib}.")
-                continue
-
-            log_file_path = f"/tmp/{log_file_name}"
-            local_log_path = f"{base_path}/{log_file_name}_{storage_node_ip}_{timestamp}"
-            copy_log_cmd = f"sudo docker cp {container_name}:{log_file_path} {local_log_path}"
-            self.exec_command(storage_node_ip, copy_log_cmd)
-
-            self.logger.info(f"Fetched log for {distrib}: {local_log_path}")
-
-            # Clean up
-            delete_log_cmd = f"sudo docker exec {container_name} rm -f {log_file_path}"
-            self.exec_command(storage_node_ip, delete_log_cmd)
+        final_tar = (tar_out or "").strip().splitlines()[-1] if tar_out else f"{base_path}/{os.path.basename(remote_tar)}"
+        self.logger.info(f"[{storage_node_ip}] Distrib logs saved: {base_path} (tar: {final_tar})")
 
-        self.logger.info("All distrib logs retrieved successfully.")
 
     def clone_mount_gen_uuid(self, node, device):
         """Repair the XFS filesystem and generate a new UUID.
@@ -1722,8 +2295,8 @@ def start_netstat_dmesg_logging(self, node_ip, log_dir):
 
         self.exec_command(node_ip, f"sudo tmux new-session -d -s netstat_log 'bash -c \"while true; do netstat -s | grep \\\"segments dropped\\\" >> {netstat_log}; sleep 5; done\"'")
         self.exec_command(node_ip, f"sudo tmux new-session -d -s dmesg_log 'bash -c \"while true; do sudo dmesg | grep -i \\\"tcp\\\" >> {dmesg_log}; sleep 5; done\"'")
-        self.exec_command(node_ip, f"sudo tmux new-session -d -s journalctl_log 'bash -c \"while true; do sudo journalctl -k | grep -i \\\"tcp\\\" >> {journalctl_log}; sleep 5; done\"'")
-
+        self.exec_command(node_ip, f"sudo tmux new-session -d -s journalctl_log 'bash -c \"while true; do sudo journalctl -k --no-tail | grep -i \\\"tcp\\\" >> {journalctl_log}; sleep 5; done\"'")
+                
     def reset_iptables_in_spdk(self, node_ip):
         """
         Resets iptables rules inside the SPDK container on a given node.
@@ -1915,6 +2488,7 @@ def start_resource_monitors(self, node_ip, log_dir):
         root_log = f"{log_dir}/root_partition_usage_{node_ip}_{timestamp}.txt"
         docker_mem_log = f"{log_dir}/docker_mem_usage_{node_ip}_{timestamp}.txt"
         system_mem_log = f"{log_dir}/system_memory_usage_{node_ip}_{timestamp}.txt"
+        docker_stats_logs = f"{log_dir}/docker_stats_usage_{node_ip}_{timestamp}.txt"
 
         # Ensure log directory exists and is writable
         self.exec_command(node_ip, f"sudo mkdir -p {log_dir} && sudo chmod 777 {log_dir}")
@@ -1939,14 +2513,29 @@ def start_resource_monitors(self, node_ip, log_dir):
         'bash -c "while true; do date >> {system_mem_log}; free -h >> {system_mem_log}; echo >> {system_mem_log}; sleep 10; done"'
         """
 
+        docker_stats_cmd = f"""
+        sudo tmux new-session -d -s docker_stats_all \
+        'bash -c "while true; do date >> {docker_stats_logs}; docker stats --no-stream >> {docker_stats_logs}; echo >> {docker_stats_logs}; sleep 10; done"'
+        """
+
         self.exec_command(node_ip, df_cmd)
         self.exec_command(node_ip, docker_cmd)
         self.exec_command(node_ip, system_cmd)
+        self.exec_command(node_ip, docker_stats_cmd)
 
-        self.logger.info(f"Started root partition, container memory, and system memory logging on {node_ip}")
+        self.logger.info(f"Started root partition, container memory, docker stats and system memory logging on {node_ip}")
+    
+    def cluster_list(self, node_ip, cluster_id):
+        """Sets cluster in suspended state
 
+        Args:
+            node_ip (str): Mgmt Node IP to run command on
+            cluster_id (str): Cluster id to put in suspended state
+        """
+        cmd = f"{self.base_cmd} cluster list"
+        output, _ = self.exec_command(node_ip, cmd)
+        return output.strip()
 
-    
     def suspend_cluster(self, node_ip, cluster_id):
         """Sets cluster in suspended state
 
@@ -1995,7 +2584,7 @@ def ensure_nfs_mounted(self, node, nfs_server, nfs_path, mount_point, is_local =
         """
         check_cmd = f"mount | grep -w '{mount_point}'"
         mount_cmd = f"sudo mkdir -p {mount_point} && sudo mount -t nfs {nfs_server}:{nfs_path} {mount_point}"
-        install_check_cmd = "dnf list installed nfs-util"
+        install_check_cmd = "dnf list installed nfs-utils"
         install_cmd = "sudo dnf install -y nfs-utils"
 
         try:
@@ -2300,3 +2889,10 @@ def stop_log_monitor(self):
             self._monitor_stop_flag.set()
             self._monitor_thread.join(timeout=10)
             print("K8s log monitor thread stopped.")
+
+def _rid(n=6):
+    import string
+    import random
+    letters = string.ascii_uppercase
+    digits = string.digits
+    return random.choice(letters) + ''.join(random.choices(letters + digits, k=n-1))
diff --git a/requirements.txt b/requirements.txt
index 030cca8e0..9ee458f00 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,3 +24,4 @@ flask-openapi3
 jsonschema
 fastapi
 uvicorn
+prometheus_api_client
\ No newline at end of file
diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py
index ff07e6634..5e6352cc0 100644
--- a/simplyblock_core/cluster_ops.py
+++ b/simplyblock_core/cluster_ops.py
@@ -25,6 +25,7 @@
 from simplyblock_core.models.stats import LVolStatObject, ClusterStatObject, NodeStatObject, DeviceStatObject
 from simplyblock_core.models.nvme_device import NVMeDevice
 from simplyblock_core.models.storage_node import StorageNode
+from simplyblock_core.prom_client import PromClient
 from simplyblock_core.utils import pull_docker_image_with_retry
 
 logger = utils.get_logger(__name__)
@@ -281,9 +282,6 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass,
         if not dev_ip:
             raise ValueError("Error getting ip: For Kubernetes-based deployments, please supply --mgmt-ip.")
 
-        current_node = utils.get_node_name_by_ip(dev_ip)
-        utils.label_node_as_mgmt_plane(current_node)
-
     if not cli_pass:
         cli_pass = utils.generate_string(10)
 
@@ -435,18 +433,23 @@ def _run_fio(mount_point) -> None:
 
 def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit,
                 distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity, qpair_count,
-                max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric="tcp") -> str:
+                max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric="tcp",
+                cluster_ip=None, grafana_secret=None) -> str:
+
 
+    default_cluster = None
+    monitoring_secret = os.environ.get("MONITORING_SECRET", "")
     clusters = db_controller.get_clusters()
-    if not clusters:
-        raise ValueError("No previous clusters found!")
+    if clusters:
+        default_cluster = clusters[0]
+    else:
+        logger.info("No previous clusters found")
 
     if distr_ndcs == 0 and distr_npcs == 0:
         raise ValueError("both distr_ndcs and distr_npcs cannot be 0")
 
-    monitoring_secret = os.environ.get("MONITORING_SECRET", "")
-    
     logger.info("Adding new cluster")
+
     cluster = Cluster()
     cluster.uuid = str(uuid.uuid4())
     cluster.cluster_name = name
@@ -455,14 +458,30 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn
     cluster.nqn = f"{constants.CLUSTER_NQN}:{cluster.uuid}"
     cluster.secret = utils.generate_string(20)
     cluster.strict_node_anti_affinity = strict_node_anti_affinity
-
-    default_cluster = clusters[0]
-    cluster.db_connection = default_cluster.db_connection
-    cluster.grafana_secret = monitoring_secret if default_cluster.mode == "kubernetes" else default_cluster.grafana_secret
-    cluster.grafana_endpoint = default_cluster.grafana_endpoint
-
+    if default_cluster:
+        cluster.mode = default_cluster.mode
+        cluster.db_connection = default_cluster.db_connection
+        cluster.grafana_secret = grafana_secret if grafana_secret else default_cluster.grafana_secret
+        cluster.grafana_endpoint = default_cluster.grafana_endpoint
+    else:
+        # creating first cluster on k8s
+        cluster.mode = "kubernetes"
+        logger.info("Retrieving foundationdb connection string...")
+        fdb_cluster_string = utils.get_fdb_cluster_string(constants.FDB_CONFIG_NAME, constants.K8S_NAMESPACE)
+        cluster.db_connection = fdb_cluster_string
+        if monitoring_secret:
+            cluster.grafana_secret = monitoring_secret
+        else:
+            raise Exception("monitoring_secret is required")
+        cluster.grafana_endpoint = "http://simplyblock-grafana:3000"
+        if not cluster_ip:
+            cluster_ip = "0.0.0.0"
+
+        # add mgmt node object
+        mgmt_node_ops.add_mgmt_node(cluster_ip, "kubernetes", cluster.uuid)
+                   
     _create_update_user(cluster.uuid, cluster.grafana_endpoint, cluster.grafana_secret, cluster.secret)
-
+                    
     cluster.distr_ndcs = distr_ndcs
     cluster.distr_npcs = distr_npcs
     cluster.distr_bs = distr_bs
@@ -489,7 +508,6 @@ def add_cluster(blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn
     cluster.create_dt = str(datetime.datetime.now())
     cluster.write_to_db(db_controller.kv_store)
     cluster_events.cluster_create(cluster)
-    qos_controller.add_class("Default", 100, cluster.get_id())
 
     return cluster.get_id()
 
@@ -1000,16 +1018,11 @@ def list_all_info(cluster_id) -> str:
 
 
 def get_capacity(cluster_id, history, records_count=20) -> t.List[dict]:
-    cluster = db_controller.get_cluster_by_id(cluster_id)
-
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            raise ValueError(f"Error parsing history string: {history}")
-    else:
-        records_number = 20
-
-    records = db_controller.get_cluster_capacity(cluster, records_number)
+    try:
+        _ = db_controller.get_cluster_by_id(cluster_id)
+    except KeyError:
+        logger.error(f"Cluster not found: {cluster_id}")
+        return []
 
     cap_stats_keys = [
         "date",
@@ -1020,20 +1033,17 @@ def get_capacity(cluster_id, history, records_count=20) -> t.List[dict]:
         "size_util",
         "size_prov_util",
     ]
+    prom_client = PromClient(cluster_id)
+    records = prom_client.get_cluster_metrics(cluster_id, cap_stats_keys, history)
     return utils.process_records(records, records_count, keys=cap_stats_keys)
 
 
 def get_iostats_history(cluster_id, history_string, records_count=20, with_sizes=False) -> t.List[dict]:
-    cluster = db_controller.get_cluster_by_id(cluster_id)
-
-    if history_string:
-        records_number = utils.parse_history_param(history_string)
-        if not records_number:
-            raise ValueError(f"Error parsing history string: {history_string}")
-    else:
-        records_number = 20
-
-    records = db_controller.get_cluster_stats(cluster, records_number)
+    try:
+        _ = db_controller.get_cluster_by_id(cluster_id)
+    except KeyError:
+        logger.error(f"Cluster not found: {cluster_id}")
+        return []
 
     io_stats_keys = [
         "date",
@@ -1071,6 +1081,9 @@ def get_iostats_history(cluster_id, history_string, records_count=20, with_sizes
                 "write_latency_ticks",
             ]
         )
+
+    prom_client = PromClient(cluster_id)
+    records = prom_client.get_cluster_metrics(cluster_id, io_stats_keys, history_string)
     # combine records
     return utils.process_records(records, records_count, keys=io_stats_keys)
 
@@ -1183,32 +1196,43 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None,
                     service_names.append(service.attrs['Spec']['Name'])
 
         if "app_SnapshotMonitor" not in service_names:
-            logger.info("Creating snapshot monitor service")
-            cluster_docker.services.create(
-                image=service_image,
-                command="python simplyblock_core/services/snapshot_monitor.py",
-                name="app_SnapshotMonitor",
-                mounts=["/etc/foundationdb:/etc/foundationdb"],
-                env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"],
-                networks=["host"],
-                constraints=["node.role == manager"]
-            )
+            utils.create_docker_service(
+                cluster_docker=cluster_docker,
+                service_name="app_SnapshotMonitor",
+                service_file="python simplyblock_core/services/snapshot_monitor.py",
+                service_image=service_image)
+
+        if "app_TasksRunnerLVolSyncDelete" not in service_names:
+            utils.create_docker_service(
+                cluster_docker=cluster_docker,
+                service_name="app_TasksRunnerLVolSyncDelete",
+                service_file="python simplyblock_core/services/tasks_runner_sync_lvol_del.py",
+                service_image=service_image)
+
+        if "app_TasksRunnerJCCompResume" not in service_names:
+            utils.create_docker_service(
+                cluster_docker=cluster_docker,
+                service_name="app_TasksRunnerJCCompResume",
+                service_file="python simplyblock_core/services/tasks_runner_jc_comp.py",
+                service_image=service_image)
+
         logger.info("Done updating mgmt cluster")
 
     elif cluster.mode == "kubernetes":
         utils.load_kube_config_with_fallback()
         apps_v1 = k8s_client.AppsV1Api()
-
+        namespace = constants.K8S_NAMESPACE
         image_without_tag = constants.SIMPLY_BLOCK_DOCKER_IMAGE.split(":")[0]
         image_parts = "/".join(image_without_tag.split("/")[-2:])
         service_image = mgmt_image or constants.SIMPLY_BLOCK_DOCKER_IMAGE
-
+        deployment_names = []
         # Update Deployments
-        deployments = apps_v1.list_namespaced_deployment(namespace=constants.K8S_NAMESPACE)
+        deployments = apps_v1.list_namespaced_deployment(namespace=namespace)
         for deploy in deployments.items:
             if deploy.metadata.name == constants.ADMIN_DEPLOY_NAME:
                 logger.info(f"Skipping deployment {deploy.metadata.name}")
                 continue
+            deployment_names.append(deploy.metadata.name)
             for c in deploy.spec.template.spec.containers:
                 if image_parts in c.image:
                     logger.info(f"Updating deployment {deploy.metadata.name} image to {service_image}")
@@ -1218,12 +1242,28 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None,
                     deploy.spec.template.metadata.annotations = annotations
                     apps_v1.patch_namespaced_deployment(
                         name=deploy.metadata.name,
-                        namespace=constants.K8S_NAMESPACE,
+                        namespace=namespace,
                         body={"spec": {"template": deploy.spec.template}}
                     )
 
+        if "simplyblock-tasks-runner-sync-lvol-del" not in deployment_names:
+            utils.create_k8s_service(
+                namespace=namespace,
+                deployment_name="simplyblock-tasks-runner-sync-lvol-del",
+                container_name="tasks-runner-sync-lvol-del",
+                service_file="simplyblock_core/services/tasks_runner_sync_lvol_del.py",
+                container_image=service_image)
+
+        if "simplyblock-snapshot-monitor" not in deployment_names:
+            utils.create_k8s_service(
+                namespace=namespace,
+                deployment_name="simplyblock-snapshot-monitor",
+                container_name="snapshot-monitor",
+                service_file="simplyblock_core/services/snapshot_monitor.py",
+                container_image=service_image)
+
         # Update DaemonSets
-        daemonsets = apps_v1.list_namespaced_daemon_set(namespace=constants.K8S_NAMESPACE)
+        daemonsets = apps_v1.list_namespaced_daemon_set(namespace=namespace)
         for ds in daemonsets.items:
             for c in ds.spec.template.spec.containers:
                 if image_parts in c.image:
@@ -1234,7 +1274,7 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None,
                     ds.spec.template.metadata.annotations = annotations
                     apps_v1.patch_namespaced_daemon_set(
                         name=ds.metadata.name,
-                        namespace=constants.K8S_NAMESPACE,
+                        namespace=namespace,
                         body={"spec": {"template": ds.spec.template}}
                         )
 
diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py
index 41824c73a..36ba14a9e 100644
--- a/simplyblock_core/constants.py
+++ b/simplyblock_core/constants.py
@@ -133,7 +133,8 @@ def get_config_var(name, default=None):
 LVOL_NVME_CONNECT_NR_IO_QUEUES=3
 LVOL_NVME_KEEP_ALIVE_TO=10
 LVOL_NVME_KEEP_ALIVE_TO_TCP=7
-LVOL_NVMF_PORT_START=int(os.getenv('LVOL_NVMF_PORT_START', 9100))
+LVOL_NVMF_PORT_ENV = os.getenv("LVOL_NVMF_PORT_START", "")
+LVOL_NVMF_PORT_START = int(LVOL_NVMF_PORT_ENV) if LVOL_NVMF_PORT_ENV else 9100
 QPAIR_COUNT=32
 CLIENT_QPAIR_COUNT=3
 NVME_TIMEOUT_US=8000000
@@ -224,4 +225,4 @@ def get_config_var(name, default=None):
 
 qos_class_meta_and_migration_weight_percent = 25
 
-MIG_PARALLEL_JOBS = 16
\ No newline at end of file
+MIG_PARALLEL_JOBS = 64
\ No newline at end of file
diff --git a/simplyblock_core/controllers/cluster_events.py b/simplyblock_core/controllers/cluster_events.py
index e8e6c406e..059aea976 100644
--- a/simplyblock_core/controllers/cluster_events.py
+++ b/simplyblock_core/controllers/cluster_events.py
@@ -80,3 +80,13 @@ def cluster_delete(cluster):
         db_object=cluster,
         caused_by=ec.CAUSED_BY_CLI,
         message=f"Cluster deleted {cluster.get_id()}")
+
+
+def cluster_rebalancing_change(cluster, new_state, old_status):
+    ec.log_event_cluster(
+        cluster_id=cluster.get_id(),
+        domain=ec.DOMAIN_CLUSTER,
+        event=ec.EVENT_STATUS_CHANGE,
+        db_object=cluster,
+        caused_by=ec.CAUSED_BY_CLI,
+        message=f"Cluster rebalancing changed from {old_status} to {new_state}")
diff --git a/simplyblock_core/controllers/device_controller.py b/simplyblock_core/controllers/device_controller.py
index 8e684c942..6f7a0d9f5 100644
--- a/simplyblock_core/controllers/device_controller.py
+++ b/simplyblock_core/controllers/device_controller.py
@@ -6,6 +6,7 @@
 from simplyblock_core.db_controller import DBController
 from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice
 from simplyblock_core.models.storage_node import StorageNode
+from simplyblock_core.prom_client import PromClient
 from simplyblock_core.rpc_client import RPCClient
 
 
@@ -440,7 +441,7 @@ def get_device_capacity(device_id, history, records_count=20, parse_sizes=True):
     else:
         records_number = 20
 
-    records = db_controller.get_device_capacity(device, records_number)
+    # records = db_controller.get_device_capacity(device, records_number)
     cap_stats_keys = [
         "date",
         "size_total",
@@ -448,6 +449,8 @@ def get_device_capacity(device_id, history, records_count=20, parse_sizes=True):
         "size_free",
         "size_util",
     ]
+    prom_client = PromClient(device.cluster_id)
+    records = prom_client.get_device_metrics(device_id, cap_stats_keys, history)
     records_list = utils.process_records(records, records_count, keys=cap_stats_keys)
 
     if not parse_sizes:
@@ -474,15 +477,6 @@ def get_device_iostats(device_id, history, records_count=20, parse_sizes=True):
         logger.error("device not found")
         return False
 
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
-
-    records_list = db_controller.get_device_stats(device, records_number)
     io_stats_keys = [
         "date",
         "read_bytes",
@@ -496,8 +490,10 @@ def get_device_iostats(device_id, history, records_count=20, parse_sizes=True):
         "write_io_ps",
         "write_latency_ps",
     ]
+    prom_client = PromClient(device.cluster_id)
+    records = prom_client.get_device_metrics(device_id, io_stats_keys, history)
     # combine records
-    new_records = utils.process_records(records_list, records_count, keys=io_stats_keys)
+    new_records = utils.process_records(records, records_count, keys=io_stats_keys)
 
     if not parse_sizes:
         return new_records
diff --git a/simplyblock_core/controllers/health_controller.py b/simplyblock_core/controllers/health_controller.py
index c013e2d58..94855f111 100644
--- a/simplyblock_core/controllers/health_controller.py
+++ b/simplyblock_core/controllers/health_controller.py
@@ -128,11 +128,11 @@ def _check_node_api(ip):
     return False
 
 
-def _check_spdk_process_up(ip, rpc_port):
+def _check_spdk_process_up(ip, rpc_port, cluster_id):
     try:
         snode_api = SNodeClient(f"{ip}:5000", timeout=10, retry=2)
         logger.debug(f"Node API={ip}:5000")
-        is_up, _ = snode_api.spdk_process_is_up(rpc_port)
+        is_up, _ = snode_api.spdk_process_is_up(rpc_port, cluster_id)
         logger.debug(f"SPDK is {is_up}")
         return is_up
     except Exception as e:
diff --git a/simplyblock_core/controllers/lvol_controller.py b/simplyblock_core/controllers/lvol_controller.py
index 2bee37cd9..f43ccaea3 100644
--- a/simplyblock_core/controllers/lvol_controller.py
+++ b/simplyblock_core/controllers/lvol_controller.py
@@ -15,6 +15,7 @@
 from simplyblock_core.models.pool import Pool
 from simplyblock_core.models.lvol_model import LVol
 from simplyblock_core.models.storage_node import StorageNode
+from simplyblock_core.prom_client import PromClient
 from simplyblock_core.rpc_client import RPCClient
 
 logger = lg.getLogger()
@@ -1525,19 +1526,11 @@ def get_capacity(lvol_uuid, history, records_count=20, parse_sizes=True):
     db_controller = DBController()
     try:
         lvol = db_controller.get_lvol_by_id(lvol_uuid)
+        pool = db_controller.get_pool_by_id(lvol.pool_uuid)
     except KeyError as e:
         logger.error(e)
         return False
 
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
-
-    records_list = db_controller.get_lvol_stats(lvol, limit=records_number)
     cap_stats_keys = [
         "date",
         "size_total",
@@ -1547,6 +1540,8 @@ def get_capacity(lvol_uuid, history, records_count=20, parse_sizes=True):
         "size_prov",
         "size_prov_util"
     ]
+    prom_client = PromClient(pool.cluster_id)
+    records_list = prom_client.get_lvol_metrics(lvol_uuid, cap_stats_keys, history)
     new_records = utils.process_records(records_list, records_count, keys=cap_stats_keys)
 
     if not parse_sizes:
@@ -1568,19 +1563,11 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si
     db_controller = DBController()
     try:
         lvol = db_controller.get_lvol_by_id(lvol_uuid)
+        pool = db_controller.get_pool_by_id(lvol.pool_uuid)
     except KeyError as e:
         logger.error(e)
         return False
 
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
-
-    records_list = db_controller.get_lvol_stats(lvol, limit=records_number)
     io_stats_keys = [
         "date",
         "read_bytes",
@@ -1591,7 +1578,6 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si
         "write_bytes_ps",
         "write_io_ps",
         "write_latency_ps",
-        "connected_clients",
     ]
     if with_sizes:
         io_stats_keys.extend(
@@ -1616,6 +1602,8 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si
                 "write_latency_ticks",
             ]
         )
+    prom_client = PromClient(pool.cluster_id)
+    records_list = prom_client.get_lvol_metrics(lvol_uuid, io_stats_keys, history)
     # combine records
     new_records = utils.process_records(records_list, records_count, keys=io_stats_keys)
 
@@ -1634,7 +1622,6 @@ def get_io_stats(lvol_uuid, history, records_count=20, parse_sizes=True, with_si
             "Write speed": utils.humanbytes(record['write_bytes_ps']),
             "Write IOPS": record['write_io_ps'],
             "Write lat": record['write_latency_ps'],
-            "Con": record['connected_clients'],
         })
     return out
 
diff --git a/simplyblock_core/controllers/pool_controller.py b/simplyblock_core/controllers/pool_controller.py
index db7016d7d..2440a6bd7 100644
--- a/simplyblock_core/controllers/pool_controller.py
+++ b/simplyblock_core/controllers/pool_controller.py
@@ -12,6 +12,7 @@
 from simplyblock_core.controllers import pool_events, lvol_controller
 from simplyblock_core.db_controller import DBController
 from simplyblock_core.models.pool import Pool
+from simplyblock_core.prom_client import PromClient
 from simplyblock_core.rpc_client import RPCClient
 
 logger = lg.getLogger()
@@ -321,15 +322,18 @@ def get_io_stats(pool_id, history, records_count=20):
         logger.error(f"Pool not found {pool_id}")
         return False
 
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
+    io_stats_keys = [
+        "date",
+        "read_bytes_ps",
+        "read_io_ps",
+        "read_latency_ps",
+        "write_bytes_ps",
+        "write_io_ps",
+        "write_latency_ps",
+    ]
 
-    out = db_controller.get_pool_stats(pool, records_number)
+    prom_client = PromClient(pool.cluster_id)
+    out = prom_client.get_pool_metrics(pool_id, io_stats_keys, history)
     new_records = utils.process_records(out, records_count)
 
     return utils.print_table([
diff --git a/simplyblock_core/controllers/storage_events.py b/simplyblock_core/controllers/storage_events.py
index b73890cd8..027f7dbed 100644
--- a/simplyblock_core/controllers/storage_events.py
+++ b/simplyblock_core/controllers/storage_events.py
@@ -72,3 +72,27 @@ def snode_rpc_timeout(node, timeout_seconds, caused_by=ec.CAUSED_BY_MONITOR):
         event_level=EventObj.LEVEL_WARN,
         message=f"Storage node RPC timeout detected after {timeout_seconds} seconds",
         node_id=node.get_id())
+
+
+def jm_repl_tasks_found(node, jm_vuid, caused_by=ec.CAUSED_BY_MONITOR):
+    ec.log_event_cluster(
+        cluster_id=node.cluster_id,
+        domain=ec.DOMAIN_CLUSTER,
+        event=ec.EVENT_STATUS_CHANGE,
+        db_object=node,
+        caused_by=caused_by,
+        event_level=EventObj.LEVEL_WARN,
+        message=f"JM replication task found for jm {jm_vuid}",
+        node_id=node.get_id())
+
+
+def node_ports_changed(node, caused_by=ec.CAUSED_BY_MONITOR):
+    ec.log_event_cluster(
+        cluster_id=node.cluster_id,
+        domain=ec.DOMAIN_CLUSTER,
+        event=ec.EVENT_STATUS_CHANGE,
+        db_object=node,
+        caused_by=caused_by,
+        event_level=EventObj.LEVEL_WARN,
+        message=f"Storage node ports set, LVol:{node.lvol_subsys_port} RPC:{node.rpc_port} Internal:{node.nvmf_port}",
+        node_id=node.get_id())
diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py
index 689027d08..b7c434f63 100644
--- a/simplyblock_core/controllers/tasks_controller.py
+++ b/simplyblock_core/controllers/tasks_controller.py
@@ -70,6 +70,11 @@ def _add_task(function_name, cluster_id, node_id, device_id,
         if task_id:
             logger.info(f"Task found, skip adding new task: {task_id}")
             return False
+    elif function_name == JobSchedule.FN_LVOL_SYNC_DEL:
+        task_id = get_lvol_sync_del_task(cluster_id, node_id, function_params['lvol_bdev_name'])
+        if task_id:
+            logger.info(f"Task found, skip adding new task: {task_id}")
+            return False
 
     task_obj = JobSchedule()
     task_obj.uuid = str(uuid.uuid4())
@@ -386,3 +391,21 @@ def get_jc_comp_task(cluster_id, node_id, jm_vuid=0):
                 if jm_vuid and "jm_vuid" in task.function_params and task.function_params["jm_vuid"] == jm_vuid:
                     return task.uuid
     return False
+
+
+def add_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name):
+    return _add_task(JobSchedule.FN_LVOL_SYNC_DEL, cluster_id, node_id, "",
+                     function_params={"lvol_bdev_name": lvol_bdev_name}, max_retry=10)
+
+def get_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name=None):
+    tasks = db.get_job_tasks(cluster_id)
+    for task in tasks:
+        if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL and task.node_id == node_id :
+            if task.status != JobSchedule.STATUS_DONE and task.canceled is False:
+                if lvol_bdev_name:
+                    if "lvol_bdev_name" in task.function_params and task.function_params["lvol_bdev_name"] == lvol_bdev_name:
+                        return task.uuid
+                else:
+                    return task.uuid
+    return False
+
diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var
index e1d2e2f8b..f34a430a9 100644
--- a/simplyblock_core/env_var
+++ b/simplyblock_core/env_var
@@ -1,5 +1,5 @@
 SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev
-SIMPLY_BLOCK_VERSION=19.2.24
+SIMPLY_BLOCK_VERSION=19.2.27
 
 SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:main
 SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:main-latest
diff --git a/simplyblock_core/mgmt_node_ops.py b/simplyblock_core/mgmt_node_ops.py
index 84375d819..6d752a86c 100644
--- a/simplyblock_core/mgmt_node_ops.py
+++ b/simplyblock_core/mgmt_node_ops.py
@@ -106,18 +106,13 @@ def deploy_mgmt_node(cluster_ip, cluster_id, ifname, mgmt_ip, cluster_secret, mo
 
         logger.info(f"Node IP: {dev_ip}")
 
-        hostname = utils.get_node_name_by_ip(dev_ip)
-        utils.label_node_as_mgmt_plane(hostname)
         db_connection = cluster_data['db_connection']
         db_controller = DBController()
         nodes = db_controller.get_mgmt_nodes()
         if not nodes:
             logger.error("No mgmt nodes was found in the cluster!")
             return False
-        for node in nodes:
-            if node.hostname == hostname:
-                logger.error("Node already exists in the cluster")
-                return False
+
 
     logger.info("Adding management node object")
     node_id = add_mgmt_node(dev_ip, mode, cluster_id)
@@ -225,10 +220,9 @@ def deploy_mgmt_node(cluster_ip, cluster_id, ifname, mgmt_ip, cluster_secret, mo
 
 def add_mgmt_node(mgmt_ip, mode, cluster_id=None):
     db_controller = DBController()
+    hostname = ""
     if mode == "docker":
         hostname = utils.get_hostname()
-    elif mode == "kubernetes":
-        hostname = utils.get_node_name_by_ip(mgmt_ip)
     try:
         node = db_controller.get_mgmt_node_by_hostname(hostname)
         if node:
diff --git a/simplyblock_core/models/job_schedule.py b/simplyblock_core/models/job_schedule.py
index 3d87a9aca..bbdcd7871 100644
--- a/simplyblock_core/models/job_schedule.py
+++ b/simplyblock_core/models/job_schedule.py
@@ -22,6 +22,7 @@ class JobSchedule(BaseModel):
     FN_BALANCING_AFTER_DEV_REMOVE = "balancing_on_dev_rem"
     FN_BALANCING_AFTER_DEV_EXPANSION = "balancing_on_dev_add"
     FN_JC_COMP_RESUME = "jc_comp_resume"
+    FN_LVOL_SYNC_DEL = "lvol_sync_del"
 
     canceled: bool = False
     cluster_id: str = ""
diff --git a/simplyblock_core/models/storage_node.py b/simplyblock_core/models/storage_node.py
index 8c76d3649..45abceec9 100644
--- a/simplyblock_core/models/storage_node.py
+++ b/simplyblock_core/models/storage_node.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-
+import time
 from typing import List
 from uuid import uuid4
 
@@ -102,7 +102,6 @@ class StorageNode(BaseNodeObject):
     hublvol: HubLVol = None  # type: ignore[assignment]
     active_tcp: bool = True
     active_rdma: bool = False
-    lvol_sync_del_queue: List[str] = []
 
     def rpc_client(self, **kwargs):
         """Return rpc client to this node
@@ -303,3 +302,23 @@ def create_alceml(self, name, nvme_bdev, uuid, **kwargs):
             alceml_worker_cpu_mask=alceml_worker_cpu_mask,
             **kwargs,
         )
+
+    def wait_for_jm_rep_tasks_to_finish(self, jm_vuid):
+        retry = 10
+        while retry > 0:
+            try:
+                jm_replication_tasks = False
+                ret = self.rpc_client().jc_get_jm_status(jm_vuid)
+                for jm in ret:
+                    if ret[jm] is False:  # jm is not ready (has active replication task)
+                        jm_replication_tasks = True
+                        break
+                if jm_replication_tasks:
+                    logger.warning(f"Replication task found on node: {self.get_id()}, jm_vuid: {jm_vuid}, retry...")
+                    retry -= 1
+                    time.sleep(20)
+                else:
+                    return True
+            except Exception:
+                logger.warning("Failed to get replication task!")
+        return False
diff --git a/simplyblock_core/prom_client.py b/simplyblock_core/prom_client.py
new file mode 100644
index 000000000..833d42b36
--- /dev/null
+++ b/simplyblock_core/prom_client.py
@@ -0,0 +1,130 @@
+import logging
+import re
+from datetime import datetime, timedelta
+
+from simplyblock_core import constants
+from simplyblock_core.db_controller import DBController
+from simplyblock_core.models.mgmt_node import MgmtNode
+
+from prometheus_api_client import PrometheusConnect
+
+logger = logging.getLogger()
+
+
+class PromClientException(Exception):
+    def __init__(self, message):
+        self.message = message
+
+
+class PromClient:
+
+    def __init__(self, cluster_id):
+        db_controller = DBController()
+        cluster_ip = None
+        cluster = db_controller.get_cluster_by_id(cluster_id)
+        if cluster.mode == "docker":
+            for node in db_controller.get_mgmt_nodes():
+                if node.cluster_id == cluster_id and node.status == MgmtNode.STATUS_ONLINE:
+                    cluster_ip = node.mgmt_ip
+                    break
+            if cluster_ip is None:
+                raise PromClientException("Cluster has no online mgmt nodes")
+        else:
+            cluster_ip = constants.PROMETHEUS_STATEFULSET_NAME
+        self.ip_address = f"{cluster_ip}:9090"
+        self.url = 'http://%s/' % self.ip_address
+        self.client = PrometheusConnect(url=self.url, disable_ssl=True)
+
+    def parse_history_param(self, history_string):
+        if not history_string:
+            logger.error("Invalid history value")
+            return False
+
+        # process history
+        results = re.search(r'^(\d+[hmd])(\d+[hmd])?$', history_string.lower())
+        if not results:
+            logger.error(f"Error parsing history string: {history_string}")
+            logger.info("History format: xxdyyh , e.g: 1d12h, 1d, 2h, 1m")
+            return False
+
+        history_in_days = 0
+        history_in_hours = 0
+        history_in_minutes = 0
+        for s in results.groups():
+            if not s:
+                continue
+            ind = s[-1]
+            v = int(s[:-1])
+            if ind == 'd':
+                history_in_days = v
+            if ind == 'h':
+                history_in_hours = v
+            if ind == 'm':
+                history_in_minutes = v
+
+        history_in_hours += int(history_in_minutes/60)
+        history_in_minutes = history_in_minutes % 60
+        history_in_days += int(history_in_hours/24)
+        history_in_hours = history_in_hours % 24
+        return history_in_days, history_in_hours, history_in_minutes
+
+    def get_metrics(self, key_prefix, metrics_lst, params, history=None):
+        start_time = datetime.now() - timedelta(minutes=10)
+        if history:
+            try:
+                days,hours,minutes = self.parse_history_param(history)
+                start_time = datetime.now() - timedelta(days=days, hours=hours, minutes=minutes)
+            except Exception:
+                raise PromClientException(f"Error parsing history string: {history}")
+        end_time = datetime.now()
+        data_out: list[dict] = []
+        for key in metrics_lst:
+            metrics = self.client.get_metric_range_data(
+                f"{key_prefix}_{key}", label_config=params, start_time=start_time, end_time=end_time)
+            for m in metrics:
+                mt_name = key
+                mt_values = m["values"]
+                for i, v in enumerate(mt_values):
+                    value = v[1]
+                    try:
+                        value = int(value)
+                    except Exception:
+                        pass
+                    if len(data_out) <= i:
+                        data_out.append({mt_name: value})
+                    else:
+                        d = data_out[i]
+                        if mt_name not in d:
+                            d[mt_name] = value
+
+        return data_out
+
+    def get_cluster_metrics(self, cluster_uuid, metrics_lst, history=None):
+        params = {
+            "cluster": cluster_uuid
+        }
+        return self.get_metrics("cluster", metrics_lst, params, history)
+
+    def get_node_metrics(self, snode_uuid, metrics_lst, history=None):
+        params = {
+            "snode": snode_uuid
+        }
+        return self.get_metrics("snode", metrics_lst, params, history)
+
+    def get_device_metrics(self, device_uuid, metrics_lst, history=None):
+        params = {
+            "device": device_uuid
+        }
+        return self.get_metrics("device", metrics_lst, params, history)
+
+    def get_lvol_metrics(self, lvol_uuid, metrics_lst, history=None):
+        params = {
+            "lvol": lvol_uuid
+        }
+        return self.get_metrics("lvol", metrics_lst, params, history)
+
+    def get_pool_metrics(self, pool_uuid, metrics_lst, history=None):
+        params = {
+            "pool": pool_uuid
+        }
+        return self.get_metrics("pool", metrics_lst, params, history)
diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py
index 886695974..e1269d2fc 100644
--- a/simplyblock_core/rpc_client.py
+++ b/simplyblock_core/rpc_client.py
@@ -922,7 +922,7 @@ def distr_migration_status(self, name):
         params = {"name": name}
         return self._request("distr_migration_status", params)
 
-    def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=False, job_size=1024, jobs=4):
+    def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=False, job_size=64, jobs=64):
         params = {
             "name": name,
             "storage_ID": storage_ID,
@@ -935,7 +935,7 @@ def distr_migration_failure_start(self, name, storage_ID, qos_high_priority=Fals
             params["jobs"] = jobs
         return self._request("distr_migration_failure_start", params)
 
-    def distr_migration_expansion_start(self, name, qos_high_priority=False, job_size=1024, jobs=4):
+    def distr_migration_expansion_start(self, name, qos_high_priority=False, job_size=64, jobs=64):
         params = {
             "name": name,
         }
diff --git a/simplyblock_core/scripts/charts/Chart.yaml b/simplyblock_core/scripts/charts/Chart.yaml
index 9d1b62643..380f67bcd 100644
--- a/simplyblock_core/scripts/charts/Chart.yaml
+++ b/simplyblock_core/scripts/charts/Chart.yaml
@@ -26,11 +26,6 @@ dependencies:
     version: "25.18.0"
     repository: "https://prometheus-community.github.io/helm-charts"
     condition: monitoring.enabled
-  - name: openebs
-    version: 3.9.0 
-    repository: https://openebs.github.io/charts
-    alias: openebs
-    condition: openebs.enabled
   - name: ingress-nginx
     version: 4.10.1
     repository: "https://kubernetes.github.io/ingress-nginx"
diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml b/simplyblock_core/scripts/charts/templates/app_k8s.yaml
index ec2e5b378..988955a4f 100644
--- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml
+++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml
@@ -19,6 +19,8 @@ spec:
         app: simplyblock-admin-control
     spec:
       serviceAccountName: simplyblock-control-sa
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet 
       containers:
       - name: simplyblock-control
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -91,6 +93,17 @@ spec:
             configMapKeyRef:
               name: simplyblock-config
               key: LOG_LEVEL
+        - name: LVOL_NVMF_PORT_START
+          value: "{{ .Values.ports.lvolNvmfPortStart }}"
+        - name: K8S_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: MONITORING_SECRET
+          valueFrom:
+            secretKeyRef:
+              name: simplyblock-grafana-secrets
+              key: MONITORING_SECRET
         - name: FLASK_DEBUG
           value: "False"
         - name: FLASK_ENV
@@ -133,6 +146,8 @@ spec:
       labels:
         app: simplyblock-storage-node-monitor
     spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet 
       containers:
       - name: storage-node-monitor
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -183,6 +198,8 @@ spec:
       labels:
         app: simplyblock-mgmt-node-monitor
     spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet 
       containers:
         - name: mgmt-node-monitor
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -235,6 +252,8 @@ spec:
       labels:
         app: simplyblock-lvol-stats-collector
     spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet 
       containers:
         - name: lvol-stats-collector
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -284,7 +303,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-main-distr-event-collector
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: main-distr-event-collector
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -334,7 +355,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-capacity-and-stats-collector
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
       - name: capacity-and-stats-collector
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -385,7 +408,8 @@ spec:
       labels:
         app: simplyblock-capacity-monitor
     spec:
-      
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
       - name: capacity-monitor
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -435,7 +459,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-health-check
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
       - name: health-check
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -485,7 +511,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-device-monitor
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
       - name: device-monitor
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -535,7 +563,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-lvol-monitor
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
       - name: lvol-monitor
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -584,7 +614,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-snapshot-monitor
-    spec:     
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet      
       containers:
       - name: snapshot-monitor
         image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -685,7 +717,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-restart
-    spec:     
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet      
       containers:
         - name: tasks-runner-restart
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -735,7 +769,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-migration
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-runner-migration
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -784,7 +820,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-failed-migration
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-runner-failed-migration
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -833,7 +871,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-cluster-status
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-runner-cluster-status
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -882,7 +922,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-new-device-migration
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-runner-new-device-migration
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -931,7 +973,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-node-add-runner
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-node-addrunner
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -983,7 +1027,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-port-allow
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-runner-port-allow
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -1032,7 +1078,9 @@ spec:
         reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
       labels:
         app: simplyblock-tasks-runner-jc-comp-resume
-    spec:      
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet       
       containers:
         - name: tasks-runner-jc-comp-resume
           image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
@@ -1063,6 +1111,57 @@ spec:
             - key: cluster-file
               path: fdb.cluster
 ---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: simplyblock-tasks-runner-sync-lvol-del
+  namespace: {{ .Release.Namespace }}
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: simplyblock-tasks-runner-sync-lvol-del
+  template:
+    metadata:
+      annotations:
+        log-collector/enabled: "true"
+        reloader.stakater.com/auto: "true"
+        reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config"
+      labels:
+        app: simplyblock-tasks-runner-sync-lvol-del
+    spec:
+      hostNetwork: true
+      dnsPolicy: ClusterFirstWithHostNet
+      containers:
+        - name: tasks-runner-sync-lvol-del
+          image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}"
+          imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}"
+          command: ["python", "simplyblock_core/services/tasks_runner_sync_lvol_del.py"]
+          env:
+            - name: SIMPLYBLOCK_LOG_LEVEL
+              valueFrom:
+                configMapKeyRef:
+                  name: simplyblock-config
+                  key: LOG_LEVEL
+          volumeMounts:
+          - name: fdb-cluster-file
+            mountPath: /etc/foundationdb/fdb.cluster
+            subPath: fdb.cluster
+          resources:
+            requests:
+              cpu: "200m"
+              memory: "256Mi"
+            limits:
+              cpu: "400m"
+              memory: "1Gi"
+      volumes:
+      - name: fdb-cluster-file
+        configMap:
+          name: simplyblock-fdb-cluster-config
+          items:
+            - key: cluster-file
+              path: fdb.cluster
+---
 
 apiVersion: apps/v1
 kind: DaemonSet
diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml
new file mode 100644
index 000000000..2a9d7d044
--- /dev/null
+++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-driverinfo.yaml
@@ -0,0 +1,24 @@
+apiVersion: storage.k8s.io/v1
+kind: CSIDriver
+metadata:
+  name: hostpath.csi.k8s.io
+  labels:
+    app.kubernetes.io/instance: hostpath.csi.k8s.io
+    app.kubernetes.io/part-of: csi-driver-host-path
+    app.kubernetes.io/name: hostpath.csi.k8s.io
+    app.kubernetes.io/component: csi-driver
+spec:
+  # Supports persistent and ephemeral inline volumes.
+  volumeLifecycleModes:
+  - Persistent
+  - Ephemeral
+  # To determine at runtime which mode a volume uses, pod info and its
+  # "csi.storage.k8s.io/ephemeral" entry are needed.
+  podInfoOnMount: true
+  # No attacher needed.
+  attachRequired: false
+  storageCapacity: false
+  # Kubernetes may use fsGroup to change permissions and ownership 
+  # of the volume to match user requested fsGroup in the pod's SecurityPolicy
+  fsGroupPolicy: File
+  
\ No newline at end of file
diff --git a/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml
new file mode 100644
index 000000000..8e695e593
--- /dev/null
+++ b/simplyblock_core/scripts/charts/templates/csi-hostpath-plugin.yaml
@@ -0,0 +1,233 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: csi-hostpathplugin-sa
+  namespace: {{ .Release.Namespace }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: csi-hostpathplugin
+rules:
+  - apiGroups: [""]
+    resources: ["persistentvolumes"]
+    verbs: ["get", "list", "watch", "create", "delete", "update", "patch"]
+  - apiGroups: [""]
+    resources: ["persistentvolumeclaims"]
+    verbs: ["get", "list", "watch", "update"]
+  - apiGroups: [""]
+    resources: ["persistentvolumeclaims/status"]
+    verbs: ["get", "update", "patch"]
+  - apiGroups: ["storage.k8s.io"]
+    resources: ["volumeattachments"]
+    verbs: ["get", "list", "watch", "update"]
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["storage.k8s.io"]
+    resources: ["csinodes"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["storage.k8s.io"]
+    resources: ["storageclasses"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["storage.k8s.io"]
+    resources: ["csistoragecapacities"]
+    verbs: ["get", "list", "watch", "create", "update", "delete"]
+  - apiGroups: [""]
+    resources: ["events"]
+    verbs: ["create", "patch", "update", "get", "list", "watch"]
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: csi-hostpathplugin
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: csi-hostpathplugin
+subjects:
+  - kind: ServiceAccount
+    name: csi-hostpathplugin-sa
+    namespace: {{ .Release.Namespace }}
+
+---
+kind: DaemonSet
+apiVersion: apps/v1
+metadata:
+  name: csi-hostpathplugin
+  labels:
+    app.kubernetes.io/instance: hostpath.csi.k8s.io
+    app.kubernetes.io/part-of: csi-driver-host-path
+    app.kubernetes.io/name: csi-hostpathplugin
+    app.kubernetes.io/component: plugin
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/instance: hostpath.csi.k8s.io
+      app.kubernetes.io/part-of: csi-driver-host-path
+      app.kubernetes.io/name: csi-hostpathplugin
+      app.kubernetes.io/component: plugin
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/instance: hostpath.csi.k8s.io
+        app.kubernetes.io/part-of: csi-driver-host-path
+        app.kubernetes.io/name: csi-hostpathplugin
+        app.kubernetes.io/component: plugin
+    spec:
+      serviceAccountName: csi-hostpathplugin-sa
+      containers:
+        - name: csi-provisioner
+          image: registry.k8s.io/sig-storage/csi-provisioner:v6.0.0
+          args:
+            - -v=5
+            - --csi-address=/csi/csi.sock
+            - --feature-gates=Topology=true
+            - --node-deployment=true
+            - --strict-topology=true
+            - --immediate-topology=false
+            - --worker-threads=5
+          env:
+          - name: NODE_NAME
+            valueFrom:
+              fieldRef:
+                apiVersion: v1
+                fieldPath: spec.nodeName
+          - name: NAMESPACE
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.namespace
+          - name: POD_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.name
+          securityContext:
+            # This is necessary only for systems with SELinux, where
+            # non-privileged sidecar containers cannot access unix domain socket
+            # created by privileged CSI driver container.
+            privileged: true
+          volumeMounts:
+            - mountPath: /csi
+              name: socket-dir
+        - name: csi-resizer
+          image: registry.k8s.io/sig-storage/csi-resizer:v2.0.0
+          args:
+            - -v=5
+            - -csi-address=/csi/csi.sock
+          securityContext:
+            # This is necessary only for systems with SELinux, where
+            # non-privileged sidecar containers cannot access unix domain socket
+            # created by privileged CSI driver container.
+            privileged: true
+          volumeMounts:
+            - mountPath: /csi
+              name: socket-dir
+
+        - name: node-driver-registrar
+          image: registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.12.0
+          args:
+            - --v=5
+            - --csi-address=/csi/csi.sock
+            - --kubelet-registration-path=/var/lib/kubelet/plugins/csi-hostpath/csi.sock
+          securityContext:
+            # This is necessary only for systems with SELinux, where
+            # non-privileged sidecar containers cannot access unix domain socket
+            # created by privileged CSI driver container.
+            privileged: true
+          env:
+            - name: KUBE_NODE_NAME
+              valueFrom:
+                fieldRef:
+                  apiVersion: v1
+                  fieldPath: spec.nodeName
+          volumeMounts:
+          - mountPath: /csi
+            name: socket-dir
+          - mountPath: /registration
+            name: registration-dir
+          - mountPath: /csi-data-dir
+            name: csi-data-dir
+
+        - name: hostpath
+          image: registry.k8s.io/sig-storage/hostpathplugin:v1.15.0
+          args:
+            - --drivername=hostpath.csi.k8s.io
+            - --v=5
+            - --endpoint=$(CSI_ENDPOINT)
+            - --nodeid=$(KUBE_NODE_NAME)
+            - --capacity=slow=10Gi
+            - --capacity=fast=100Gi
+          env:
+            - name: CSI_ENDPOINT
+              value: unix:///csi/csi.sock
+            - name: KUBE_NODE_NAME
+              valueFrom:
+                fieldRef:
+                  apiVersion: v1
+                  fieldPath: spec.nodeName
+          securityContext:
+            privileged: true
+          ports:
+          - containerPort: 9898
+            name: healthz
+            protocol: TCP
+          livenessProbe:
+            failureThreshold: 5
+            httpGet:
+              path: /healthz
+              port: healthz
+            initialDelaySeconds: 10
+            timeoutSeconds: 3
+            periodSeconds: 2
+          volumeMounts:
+            - mountPath: /csi
+              name: socket-dir
+            - mountPath: /var/lib/kubelet/pods
+              mountPropagation: Bidirectional
+              name: mountpoint-dir
+            - mountPath: /var/lib/kubelet/plugins
+              mountPropagation: Bidirectional
+              name: plugins-dir
+            - mountPath: /csi-data-dir
+              name: csi-data-dir
+            - mountPath: /dev
+              name: dev-dir
+        - name: liveness-probe
+          volumeMounts:
+          - mountPath: /csi
+            name: socket-dir
+          image: registry.k8s.io/sig-storage/livenessprobe:v2.15.0
+          args:
+          - --csi-address=/csi/csi.sock
+          - --health-port=9898
+
+      volumes:
+        - hostPath:
+            path: /var/lib/kubelet/plugins/csi-hostpath
+            type: DirectoryOrCreate
+          name: socket-dir
+        - hostPath:
+            path: /var/lib/kubelet/pods
+            type: DirectoryOrCreate
+          name: mountpoint-dir
+        - hostPath:
+            path: /var/lib/kubelet/plugins_registry
+            type: Directory
+          name: registration-dir
+        - hostPath:
+            path: /var/lib/kubelet/plugins
+            type: Directory
+          name: plugins-dir
+        - hostPath:
+            # 'path' is where PV data is persisted on host.
+            # using /tmp is also possible while the PVs will not available after plugin container recreation or host reboot
+            path: /var/lib/csi-hostpath-data/
+            type: DirectoryOrCreate
+          name: csi-data-dir
+        - hostPath:
+            path: /dev
+            type: Directory
+          name: dev-dir
diff --git a/simplyblock_core/scripts/charts/templates/foundationdb.yaml b/simplyblock_core/scripts/charts/templates/foundationdb.yaml
index 1a3134e58..4eb7f1410 100644
--- a/simplyblock_core/scripts/charts/templates/foundationdb.yaml
+++ b/simplyblock_core/scripts/charts/templates/foundationdb.yaml
@@ -2,20 +2,20 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: controller-manager
+  name: simplyblock-fdb-controller-manager
   labels:
-    control-plane: controller-manager
-    app: controller-manager
+    control-plane: simplyblock-fdb-controller-manager
+    app: simplyblock-fdb-controller-manager
 spec:
   selector:
     matchLabels:
-      app: controller-manager
+      app: simplyblock-fdb-controller-manager
   replicas: 1
   template:
     metadata:
       labels:
-        control-plane: controller-manager
-        app: controller-manager
+        control-plane: simplyblock-fdb-controller-manager
+        app: simplyblock-fdb-controller-manager
     spec:
       securityContext:
         runAsUser: 4059
@@ -28,7 +28,7 @@ spec:
           emptyDir: {}
         - name: fdb-binaries
           emptyDir: {}
-      serviceAccountName: controller-manager
+      serviceAccountName: simplyblock-fdb-controller-manager
       initContainers:
         - name: foundationdb-kubernetes-init-7-3
           image: foundationdb/fdb-kubernetes-monitor:7.3.63
@@ -51,6 +51,8 @@ spec:
       containers:
         - command:
             - /manager
+          args:
+            - "--health-probe-bind-address=:9443"
           image: foundationdb/fdb-kubernetes-operator:v2.13.0
           name: manager
           env:
@@ -86,13 +88,13 @@ spec:
 apiVersion: v1
 kind: ServiceAccount
 metadata:
-  name: controller-manager
+  name: simplyblock-fdb-controller-manager
 
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
-  name: manager-role
+  name: simplyblock-fdb-manager-role
 rules:
 - apiGroups:
   - ""
@@ -164,7 +166,7 @@ apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   creationTimestamp: null
-  name: manager-clusterrole
+  name: simplyblock-fdb-manager-clusterrole
 rules:
 - apiGroups:
   - ""
@@ -179,27 +181,27 @@ apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
   creationTimestamp: null
-  name: manager-rolebinding
+  name: simplyblock-fdb-manager-rolebinding
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
-  name: manager-role
+  name: simplyblock-fdb-manager-role
 subjects:
 - kind: ServiceAccount
-  name: controller-manager
+  name: simplyblock-fdb-controller-manager
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
   creationTimestamp: null
-  name: manager-clusterrolebinding
+  name: simplyblock-fdb-manager-clusterrolebinding
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
-  name: manager-clusterrole
+  name: simplyblock-fdb-manager-clusterrole
 subjects:
 - kind: ServiceAccount
-  name: controller-manager
+  name: simplyblock-fdb-controller-manager
   namespace: metadata.namespace
 
 ##### cluster file #################
@@ -213,7 +215,11 @@ spec:
     replacements:
       enabled: true
   faultDomain:
+  {{- if .Values.foundationdb.multiAZ }}
+    key: topology.kubernetes.io/zone
+  {{- else }}
     key: foundationdb.org/none
+  {{- end }}
   imageType: split
   labels:
     filterOnOwnerReference: false
@@ -225,10 +231,17 @@ spec:
     - foundationdb.org/fdb-process-group-id
   minimumUptimeSecondsForBounce: 60
   processCounts:
+  {{- if .Values.foundationdb.multiAZ }}
+    cluster_controller: 1
+    log: 4
+    storage: 4
+    stateless: -1
+  {{- else }}
     cluster_controller: 1
     log: 3
     storage: 3
     stateless: -1
+  {{- end }}
   processes:
     general:
       customParameters:
@@ -270,7 +283,7 @@ spec:
               runAsUser: 0
       volumeClaimTemplate:
         spec:
-          storageClassName: openebs-local-hostpath
+          storageClassName: local-hostpath
           accessModes:
             - ReadWriteOnce
           resources:
@@ -285,10 +298,10 @@ spec:
             resources:
               limits:
                 cpu: 500m
-                memory: 2Gi
+                memory: 4Gi
               requests:
                 cpu: 100m
-                memory: 512Mi
+                memory: 1Gi
             securityContext:
               runAsUser: 0
           affinity:
@@ -308,10 +321,10 @@ spec:
             resources:
               limits:
                 cpu: 500m
-                memory: 2Gi
+                memory: 4Gi
               requests:
                 cpu: 100m
-                memory: 512Mi
+                memory: 1Gi
             securityContext:
               runAsUser: 0
           affinity:
diff --git a/simplyblock_core/scripts/charts/templates/mongodb.yaml b/simplyblock_core/scripts/charts/templates/mongodb.yaml
index 740dd7642..815df6505 100644
--- a/simplyblock_core/scripts/charts/templates/mongodb.yaml
+++ b/simplyblock_core/scripts/charts/templates/mongodb.yaml
@@ -14,7 +14,7 @@ spec:
             name: data-volume
           spec:
             accessModes: [ "ReadWriteOnce" ]
-            storageClassName: openebs-local-hostpath
+            storageClassName: local-hostpath
             resources:
               requests:
                 storage: 5Gi
@@ -22,7 +22,7 @@ spec:
             name: logs-volume
           spec:
             accessModes: [ "ReadWriteOnce" ]
-            storageClassName: openebs-local-hostpath
+            storageClassName: local-hostpath
             resources:
               requests:
                 storage: 5Gi
diff --git a/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml b/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml
index 9c0f46e1f..1349a33a9 100644
--- a/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml
+++ b/simplyblock_core/scripts/charts/templates/monitoring_k8s.yaml
@@ -68,6 +68,8 @@ spec:
               value: "false"
             - name: GRAYLOG_ELASTICSEARCH_REPLICAS
               value: "1"
+            - name: GRAYLOG_MESSAGE_JOURNAL_MAX_SIZE
+              value: "10gb"
           ports:
             - containerPort: 5044
             - containerPort: 5140
diff --git a/simplyblock_core/scripts/charts/templates/storage_class.yaml b/simplyblock_core/scripts/charts/templates/storage_class.yaml
index 64e5e6280..b23cb4a07 100644
--- a/simplyblock_core/scripts/charts/templates/storage_class.yaml
+++ b/simplyblock_core/scripts/charts/templates/storage_class.yaml
@@ -2,9 +2,22 @@
 apiVersion: storage.k8s.io/v1
 kind: StorageClass
 metadata:
-  name: openebs-local-hostpath
-provisioner: openebs.io/local
+  name: local-hostpath
+  labels:
+    app.kubernetes.io/instance: hostpath.csi.k8s.io
+    app.kubernetes.io/part-of: csi-driver-host-path
+    app.kubernetes.io/name: csi-hostpath-fast
+    app.kubernetes.io/component: storageclass
+provisioner: hostpath.csi.k8s.io
 allowVolumeExpansion: true
 reclaimPolicy: Retain
 volumeBindingMode: WaitForFirstConsumer
-  
+{{- if .Values.storageclass.allowedTopologyZones }}
+allowedTopologies:
+- matchLabelExpressions:
+  - key: topology.kubernetes.io/zone
+    values:
+{{- range .Values.storageclass.allowedTopologyZones }}
+    - {{ . }}
+{{- end }}
+{{- end }}
diff --git a/simplyblock_core/scripts/charts/values-template.yaml b/simplyblock_core/scripts/charts/values-template.yaml
deleted file mode 100644
index 79693e7cd..000000000
--- a/simplyblock_core/scripts/charts/values-template.yaml
+++ /dev/null
@@ -1,194 +0,0 @@
-graylog:
-  rootPasswordSha2: "${GRAYLOG_ROOT_PASSWORD_SHA2}"
-  passwordSecret: "${GRAYLOG_PASSWORD_SECRET}"
-
-cluster:
-  secret: "${CLUSTER_SECRET}"
-  id: "${CLUSTER_ID}"
-  ip: "${CLUSTER_IP}"
-
-monitoring:
-  enabled: ${ENABLE_MONITORING}
-
-log:
-  deletionInterval: "${LOG_DELETION_INTERVAL}"
-  retentionPeriod: "${RETENTION_PERIOD}"
-  level: "${LOG_LEVEL}"
-  maxNumberIndex: "${MAX_NUMBER_OF_INDICES}"
-
-grafana:
-  endpoint: "${GRAFANA_ENDPOINT}"
-  contactPoint: "${CONTACT_POINT}"
-
-image:
-  simplyblock: 
-    repository: "${SIMPLYBLOCK_REPOSITORY}"
-    tag: "${SIMPLYBLOCK_TAG}"
-    pullPolicy: "Always"
-
-openebs:
-  enabled: true
-
-mongodb:
-  name: "simplyblock-mongodb"
-  deployment_name: "simplyblock-mongodb"
-  resources:
-    requests:
-      cpu: 100m
-      memory: 300Mi
-    limits:
-      cpu: 250m
-      memory: 1Gi
-  affinity:
-    podAntiAffinity:
-      requiredDuringSchedulingIgnoredDuringExecution:
-        - labelSelector:
-            matchExpressions:
-              - key: app.kubernetes.io/component
-                operator: In
-                values:
-                  - mongodb
-          topologyKey: "kubernetes.io/hostname"
-
-opensearch:
-  fullnameOverride: "simplyblock-opensearch"
-  singleNode: true
-  replicas: 1
-
-  antiAffinity: "hard"
-  persistence:
-    enabled: true
-    storageClass: openebs-local-hostpath
-    size: 10Gi
-
-  resources:
-    requests:
-      cpu: "100m"
-      memory: "512Mi"
-    limits:
-      cpu: "500m"
-      memory: "3Gi"
-
-  extraEnvs:
-    - name: OPENSEARCH_JAVA_OPTS
-      value: "-Xms1g -Xmx1g"
-    - name: bootstrap.memory_lock
-      value: "true"
-    - name: action.auto_create_index
-      value: "false"
-    - name: plugins.security.ssl.http.enabled
-      value: "false"
-    - name: plugins.security.disabled
-      value: "true"
-
-  securityConfig:
-    enabled: false
-
-prometheus:
-  server:
-    fullnameOverride: simplyblock-prometheus
-    enabled: true
-    statefulSet:
-      enabled: true
-    name: simplyblock-prometheus
-    replicaCount: 1
-    podLabels:
-      app: simplyblock-prometheus
-    podAnnotations: {}
-    affinity:
-      podAntiAffinity:
-        requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-                - key: app.kubernetes.io/component
-                  operator: In
-                  values:
-                    - simplyblock-prometheus
-            topologyKey: "kubernetes.io/hostname"
-    service:
-      servicePort: 9090
-      type: ClusterIP
-      gRPC:
-        enabled: true
-        servicePort: 10901
-      additionalPorts:
-        - name: http-thanos
-          port: 10902
-          targetPort: 10902
-          protocol: TCP
-    securityContext:
-      fsGroup: 65534
-    persistentVolume:
-      enabled: true
-      size: 5Gi
-      storageClass: openebs-local-hostpath
-    extraArgs:
-      storage.tsdb.min-block-duration: 2h
-      storage.tsdb.max-block-duration: 2h
-    sidecarContainers:
-      thanos-sidecar:
-        image: thanosio/thanos:v0.31.0
-        args:
-          - sidecar
-          - --tsdb.path=/prometheus
-          - --prometheus.url=http://localhost:9090
-          - --objstore.config-file=/etc/thanos/objstore.yml
-        ports:
-          - name: grpc
-            containerPort: 10901
-          - name: http
-            containerPort: 10902
-        volumeMounts:
-          - name: storage-volume
-            mountPath: /prometheus
-          - name: objstore-config
-            mountPath: /etc/thanos
-        resources:
-          requests:
-            cpu: "100m"
-            memory: "256Mi"
-          limits:
-            cpu: "250m"
-            memory: "1Gi"
-    resources:
-      requests:
-        cpu: "100m"
-        memory: "512Mi"
-      limits:
-        cpu: "500m"
-        memory: "1Gi"
-    configMapOverrideName: simplyblock-prometheus-config
-    extraVolumes:
-      - name: objstore-config
-        configMap:
-          name: simplyblock-objstore-config
-  alertmanager:
-    enabled: false
-
-  prometheus-pushgateway:
-    enabled: false
-
-  prometheus-node-exporter:
-    enabled: false
-
-  kube-state-metrics:
-    enabled: false
-
-ingress:
-  enabled: true
-  ingressClassName: nginx
-  useDNS: ${USE_DNS}
-  host: "${DNS_NAME}"
-  tlsSecret: ${TLS_SECRET}
-  controller:
-    hostNetwork: ${USE_HOST}  
-    dnsPolicy: ClusterFirstWithHostNet
-    service:
-      type: ${SERVICE_TYPE}
-      nodePorts:
-        tcp:
-          4501: 32451 
-    extraArgs:
-      tcp-services-configmap: "${K8S_NAMESPACE}/simplyblock-tcp-services"
-    nodeSelector:
-      simplyblock.io/role: mgmt-plane
diff --git a/simplyblock_core/scripts/charts/values.yaml b/simplyblock_core/scripts/charts/values.yaml
index 467734176..0b70f321e 100644
--- a/simplyblock_core/scripts/charts/values.yaml
+++ b/simplyblock_core/scripts/charts/values.yaml
@@ -24,9 +24,12 @@ image:
 
 ports:
   lvolNvmfPortStart:
-  
-openebs:
-  enabled: true
+
+storageclass:  
+  allowedTopologyZones: []
+
+foundationdb:
+  multiAZ: false
 
 mongodb:
   name: "simplyblock-mongodb"
@@ -57,7 +60,7 @@ opensearch:
   antiAffinity: "hard"
   persistence:
     enabled: true
-    storageClass: openebs-local-hostpath
+    storageClass: local-hostpath
     size: 10Gi
 
   resources:
@@ -123,7 +126,7 @@ prometheus:
     persistentVolume:
       enabled: true
       size: 5Gi
-      storageClass: openebs-local-hostpath
+      storageClass: local-hostpath
     extraArgs:
       storage.tsdb.min-block-duration: 2h
       storage.tsdb.max-block-duration: 2h
diff --git a/simplyblock_core/scripts/docker-compose-swarm.yml b/simplyblock_core/scripts/docker-compose-swarm.yml
index ba0f8b61d..fd79f43c1 100644
--- a/simplyblock_core/scripts/docker-compose-swarm.yml
+++ b/simplyblock_core/scripts/docker-compose-swarm.yml
@@ -349,6 +349,20 @@ services:
     environment:
       SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL"
 
+  TasksRunnerLVolSyncDelete:
+    <<: *service-base
+    image: $SIMPLYBLOCK_DOCKER_IMAGE
+    command: "python simplyblock_core/services/tasks_runner_sync_lvol_del.py"
+    deploy:
+      placement:
+        constraints: [node.role == manager]
+    volumes:
+      - "/etc/foundationdb:/etc/foundationdb"
+    networks:
+      - hostnet
+    environment:
+      SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL"
+
 networks:
   monitoring-net:
     external: true
diff --git a/simplyblock_core/scripts/install_deps.sh b/simplyblock_core/scripts/install_deps.sh
index 256a55500..56d0bf96e 100644
--- a/simplyblock_core/scripts/install_deps.sh
+++ b/simplyblock_core/scripts/install_deps.sh
@@ -2,15 +2,15 @@
 
 if [[ "$1" == "docker" ]]; then
   sudo yum install -y yum-utils
-  sudo yum install -y https://repo.almalinux.org/almalinux/9/devel/aarch64/os/Packages/tuned-profiles-realtime-2.24.0-1.el9.noarch.rpm
+  sudo yum install -y https://repo.almalinux.org/almalinux/9/devel/aarch64/os/Packages/tuned-profiles-realtime-2.26.0-1.el9.noarch.rpm
   sudo yum install -y yum-utils xorg-x11-xauth nvme-cli fio tuned
 
   sudo yum install hostname pkg-config git wget python3-pip yum-utils \
     iptables pciutils -y
 
     sudo yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
-    sudo yum install docker-ce docker-ce-cli \
-      containerd.io docker-buildx-plugin docker-compose-plugin -y
+    sudo yum install docker-ce-29.1.3-1.el9 docker-ce-cli-29.1.3-1.el9 \
+      containerd.io-2.2.0-2.el9 docker-buildx-plugin-0.30.1-1.el9 docker-compose-plugin-5.0.1-1.el9 -y
 
   sudo systemctl enable docker
   sudo systemctl start docker
diff --git a/simplyblock_core/services/capacity_and_stats_collector.py b/simplyblock_core/services/capacity_and_stats_collector.py
index 6f702d051..022dd84b5 100644
--- a/simplyblock_core/services/capacity_and_stats_collector.py
+++ b/simplyblock_core/services/capacity_and_stats_collector.py
@@ -83,6 +83,11 @@ def add_device_stats(cl, device, capacity_dict, stats_dict):
     stat_obj.write_to_db(db.kv_store)
     last_object_record[device.get_id()] = stat_obj
 
+    all_stats = db.get_device_stats(device, limit=0)
+    if len(all_stats) > 10:
+        for st in all_stats[10:]:
+            st.remove(db.kv_store)
+
     return stat_obj
 
 
@@ -117,6 +122,11 @@ def add_node_stats(node, records):
     stat_obj = NodeStatObject(data=data)
     stat_obj.write_to_db(db.kv_store)
 
+    all_stats = db.get_node_stats(node, limit=0)
+    if len(all_stats) > 10:
+        for st in all_stats[10:]:
+            st.remove(db.kv_store)
+
     return stat_obj
 
 
@@ -146,6 +156,11 @@ def add_cluster_stats(cl, records):
     stat_obj = ClusterStatObject(data=data)
     stat_obj.write_to_db(db.kv_store)
 
+    all_stats = db.get_cluster_stats(cl, limit=0)
+    if len(all_stats) > 10:
+        for st in all_stats[10:]:
+            st.remove(db.kv_store)
+
     return stat_obj
 
 
diff --git a/simplyblock_core/services/lvol_monitor.py b/simplyblock_core/services/lvol_monitor.py
index 884b67396..8486f3a32 100644
--- a/simplyblock_core/services/lvol_monitor.py
+++ b/simplyblock_core/services/lvol_monitor.py
@@ -132,8 +132,7 @@ def process_lvol_delete_finish(lvol):
         sec_node = db.get_storage_node_by_id(snode.get_id())
 
     if sec_node:
-        sec_node.lvol_sync_del_queue.append(f"{lvol.lvs_name}/{lvol.lvol_bdev}")
-        sec_node.write_to_db()
+        tasks_controller.add_lvol_sync_del_task(sec_node.cluster_id, sec_node.get_id(), f"{lvol.lvs_name}/{lvol.lvol_bdev}")
 
     lvol_events.lvol_delete(lvol)
     lvol.remove(db.kv_store)
@@ -349,19 +348,6 @@ def process_lvol_delete_try_again(lvol):
                     present = health_controller.check_bdev(snap.snap_bdev, bdev_names=node_bdev_names)
                     set_snapshot_health_check(snap, present)
 
-                snode = db.get_storage_node_by_id(snode.get_id())
-                if snode.status == StorageNode.STATUS_ONLINE:
-                    not_deleted = []
-                    for bdev_name in snode.lvol_sync_del_queue:
-                        logger.info(f"Sync delete bdev: {bdev_name} from node: {snode.get_id()}")
-                        ret, err = snode.rpc_client().delete_lvol(bdev_name, del_async=True)
-                        if not ret:
-                            if "code" in err and err["code"] == -19:
-                                logger.error(f"Sync delete completed with error: {err}")
-                            else:
-                                logger.error(f"Failed to sync delete bdev: {bdev_name} from node: {snode.get_id()}")
-                                not_deleted.append(bdev_name)
-                    snode.lvol_sync_del_queue = not_deleted
-                    snode.write_to_db()
+
 
     time.sleep(constants.LVOL_MONITOR_INTERVAL_SEC)
diff --git a/simplyblock_core/services/lvol_stat_collector.py b/simplyblock_core/services/lvol_stat_collector.py
index 09aa7d571..1933b6703 100644
--- a/simplyblock_core/services/lvol_stat_collector.py
+++ b/simplyblock_core/services/lvol_stat_collector.py
@@ -154,6 +154,11 @@ def add_lvol_stats(cluster, lvol, stats_list, capacity_dict=None):
     stat_obj.write_to_db(db.kv_store)
     last_object_record[lvol.get_id()] = stat_obj
 
+    all_stats = db.get_lvol_stats(lvol, limit=0)
+    if len(all_stats) > 10:
+        for st in all_stats[10:]:
+            st.remove(db.kv_store)
+
     return stat_obj
 
 
@@ -173,6 +178,12 @@ def add_pool_stats(pool, records):
 
     stat_obj = PoolStatObject(data=data)
     stat_obj.write_to_db(db.kv_store)
+
+    all_stats = db.get_pool_stats(pool, limit=0)
+    if len(all_stats) > 10:
+        for st in all_stats[10:]:
+            st.remove(db.kv_store)
+
     return stat_obj
 
 
diff --git a/simplyblock_core/services/snapshot_monitor.py b/simplyblock_core/services/snapshot_monitor.py
index c82476e7b..a99ed89f3 100644
--- a/simplyblock_core/services/snapshot_monitor.py
+++ b/simplyblock_core/services/snapshot_monitor.py
@@ -5,7 +5,7 @@
 
 from simplyblock_core import constants, db_controller, utils
 from simplyblock_core.models.cluster import Cluster
-from simplyblock_core.controllers import health_controller, snapshot_events
+from simplyblock_core.controllers import health_controller, snapshot_events, tasks_controller
 from simplyblock_core.models.snapshot import SnapShot
 from simplyblock_core.models.storage_node import StorageNode
 from simplyblock_core.rpc_client import RPCClient
@@ -76,8 +76,7 @@ def process_snap_delete_finish(snap, leader_node):
 
     non_leader = db.get_storage_node_by_id(non_leader_id)
     if non_leader:
-        non_leader.lvol_sync_del_queue.append(snap.snap_bdev)
-        non_leader.write_to_db()
+        tasks_controller.add_lvol_sync_del_task(non_leader.cluster_id, non_leader.get_id(), snap.snap_bdev)
 
     snapshot_events.snapshot_delete(snap)
     snap.remove(db.kv_store)
diff --git a/simplyblock_core/services/storage_node_monitor.py b/simplyblock_core/services/storage_node_monitor.py
index 17a7d0369..e7f32ad82 100644
--- a/simplyblock_core/services/storage_node_monitor.py
+++ b/simplyblock_core/services/storage_node_monitor.py
@@ -5,7 +5,8 @@
 
 
 from simplyblock_core import constants, db_controller, cluster_ops, storage_node_ops, utils
-from simplyblock_core.controllers import health_controller, device_controller, tasks_controller, storage_events
+from simplyblock_core.controllers import health_controller, device_controller, tasks_controller, storage_events, \
+    cluster_events
 from simplyblock_core.models.cluster import Cluster
 from simplyblock_core.models.job_schedule import JobSchedule
 from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice
@@ -134,10 +135,13 @@ def update_cluster_status(cluster_id):
             JobSchedule.FN_DEV_MIG, JobSchedule.FN_NEW_DEV_MIG, JobSchedule.FN_FAILED_DEV_MIG]:
             if task.retry == 0:
                 first_iter_task_pending += 1
-
+    is_re_balancing = first_iter_task_pending  > 0
     cluster = db.get_cluster_by_id(cluster_id)
-    cluster.is_re_balancing = first_iter_task_pending  > 0
-    cluster.write_to_db()
+    if cluster.is_re_balancing != is_re_balancing:
+        old_status = cluster.is_re_balancing
+        cluster.is_re_balancing = is_re_balancing
+        cluster.write_to_db()
+        cluster_events.cluster_rebalancing_change(cluster_id, cluster.is_re_balancing, old_status)
 
     current_cluster_status = cluster.status
     logger.info("cluster_status: %s", current_cluster_status)
@@ -289,7 +293,7 @@ def node_rpc_timeout_check_and_report(node):
             spdk_process = False
             if node_api_check:
                 # 3- check spdk_process
-                spdk_process = health_controller._check_spdk_process_up(snode.mgmt_ip, snode.rpc_port)
+                spdk_process = health_controller._check_spdk_process_up(snode.mgmt_ip, snode.rpc_port, snode.cluster_id)
             logger.info(f"Check: spdk process {snode.mgmt_ip}:5000 ... {spdk_process}")
 
                 # 4- check rpc
diff --git a/simplyblock_core/services/tasks_runner_failed_migration.py b/simplyblock_core/services/tasks_runner_failed_migration.py
index fce4fd8ef..7d0b3e89f 100644
--- a/simplyblock_core/services/tasks_runner_failed_migration.py
+++ b/simplyblock_core/services/tasks_runner_failed_migration.py
@@ -88,7 +88,7 @@ def task_runner(task):
         if db.get_cluster_by_id(snode.cluster_id).is_qos_set():
             qos_high_priority = True
         rsp = rpc_client.distr_migration_failure_start(
-            distr_name, device.cluster_device_order, qos_high_priority, job_size=1024, jobs=constants.MIG_PARALLEL_JOBS)
+            distr_name, device.cluster_device_order, qos_high_priority, job_size=64, jobs=constants.MIG_PARALLEL_JOBS)
         if not rsp:
             logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}")
             task.function_result = "Failed to start device migration task"
diff --git a/simplyblock_core/services/tasks_runner_jc_comp.py b/simplyblock_core/services/tasks_runner_jc_comp.py
index 676156af3..6caf85b19 100644
--- a/simplyblock_core/services/tasks_runner_jc_comp.py
+++ b/simplyblock_core/services/tasks_runner_jc_comp.py
@@ -57,6 +57,7 @@
                         if node.status != StorageNode.STATUS_ONLINE:
                             msg = f"Node is {node.status}, retry task"
                             logger.info(msg)
+                            task.retry += 1
                             task.function_result = msg
                             task.status = JobSchedule.STATUS_SUSPENDED
                             task.write_to_db(db.kv_store)
@@ -79,6 +80,7 @@
                                     logger.info(msg)
                                     task.function_result = msg
                                     task.status = JobSchedule.STATUS_SUSPENDED
+                                    task.retry += 1
                                     task.write_to_db(db.kv_store)
                                     continue
 
diff --git a/simplyblock_core/services/tasks_runner_migration.py b/simplyblock_core/services/tasks_runner_migration.py
index fb085e4aa..e325e3d7e 100644
--- a/simplyblock_core/services/tasks_runner_migration.py
+++ b/simplyblock_core/services/tasks_runner_migration.py
@@ -93,7 +93,7 @@ def task_runner(task):
         qos_high_priority = False
         if db.get_cluster_by_id(snode.cluster_id).is_qos_set():
             qos_high_priority = True
-        rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=1024,
+        rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=64,
                                                          jobs=constants.MIG_PARALLEL_JOBS)
         if not rsp:
             logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}")
diff --git a/simplyblock_core/services/tasks_runner_new_dev_migration.py b/simplyblock_core/services/tasks_runner_new_dev_migration.py
index f62a7f210..9feec7a56 100644
--- a/simplyblock_core/services/tasks_runner_new_dev_migration.py
+++ b/simplyblock_core/services/tasks_runner_new_dev_migration.py
@@ -98,7 +98,7 @@ def task_runner(task):
         qos_high_priority = False
         if db.get_cluster_by_id(snode.cluster_id).is_qos_set():
             qos_high_priority = True
-        rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=1024,
+        rsp = rpc_client.distr_migration_expansion_start(distr_name, qos_high_priority, job_size=64,
                                                          jobs=constants.MIG_PARALLEL_JOBS)
         if not rsp:
             logger.error(f"Failed to start device migration task, storage_ID: {device.cluster_device_order}")
diff --git a/simplyblock_core/services/tasks_runner_port_allow.py b/simplyblock_core/services/tasks_runner_port_allow.py
index a39de42ab..96ffc4664 100644
--- a/simplyblock_core/services/tasks_runner_port_allow.py
+++ b/simplyblock_core/services/tasks_runner_port_allow.py
@@ -3,7 +3,7 @@
 
 
 from simplyblock_core import db_controller, utils, storage_node_ops, distr_controller
-from simplyblock_core.controllers import tcp_ports_events, health_controller
+from simplyblock_core.controllers import tcp_ports_events, health_controller, tasks_controller
 from simplyblock_core.fw_api_client import FirewallClient
 from simplyblock_core.models.job_schedule import JobSchedule
 from simplyblock_core.models.cluster import Cluster
@@ -196,22 +196,24 @@
                             task.status = JobSchedule.STATUS_RUNNING
                             task.write_to_db(db.kv_store)
 
-                        not_deleted = []
-                        for bdev_name in snode.lvol_sync_del_queue:
-                            logger.info(f"Sync delete bdev: {bdev_name} from node: {snode.get_id()}")
-                            ret, err = snode.rpc_client().delete_lvol(bdev_name, del_async=True)
-                            if not ret:
-                                if "code" in err and err["code"] == -19:
-                                    logger.error(f"Sync delete completed with error: {err}")
-                                else:
-                                    logger.error(
-                                        f"Failed to sync delete bdev: {bdev_name} from node: {snode.get_id()}")
-                                    not_deleted.append(bdev_name)
-                        snode.lvol_sync_del_queue = not_deleted
-                        snode.write_to_db()
+                        # wait for lvol sync delete
+                        lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id)
+                        while lvol_sync_del_found:
+                            logger.info("Lvol sync delete task found, waiting")
+                            can_continue = False
+                            time.sleep(3)
+                            lvol_sync_del_found = tasks_controller.get_lvol_sync_del_task(task.cluster_id, task.node_id)
 
                         if sec_node and sec_node.status == StorageNode.STATUS_ONLINE:
                             sec_rpc_client = sec_node.rpc_client()
+                            ret = sec_node.wait_for_jm_rep_tasks_to_finish(node.jm_vuid)
+                            if not ret:
+                                msg = "JM replication task found on secondary"
+                                logger.warning(msg)
+                                task.function_result = msg
+                                task.status = JobSchedule.STATUS_SUSPENDED
+                                task.write_to_db(db.kv_store)
+                                continue
                             sec_rpc_client.bdev_lvol_set_leader(node.lvstore, leader=False, bs_nonleadership=True)
 
                         port_number = task.function_params["port_number"]
diff --git a/simplyblock_core/services/tasks_runner_sync_lvol_del.py b/simplyblock_core/services/tasks_runner_sync_lvol_del.py
new file mode 100644
index 000000000..fbf0c1ee4
--- /dev/null
+++ b/simplyblock_core/services/tasks_runner_sync_lvol_del.py
@@ -0,0 +1,77 @@
+# coding=utf-8
+import time
+
+
+from simplyblock_core import db_controller, utils
+from simplyblock_core.models.job_schedule import JobSchedule
+from simplyblock_core.models.cluster import Cluster
+from simplyblock_core.models.storage_node import StorageNode
+
+logger = utils.get_logger(__name__)
+
+# get DB controller
+db = db_controller.DBController()
+
+
+logger.info("Starting Tasks runner...")
+while True:
+
+    clusters = db.get_clusters()
+    if not clusters:
+        logger.error("No clusters found!")
+    else:
+        for cl in clusters:
+            if cl.status == Cluster.STATUS_IN_ACTIVATION:
+                continue
+
+            tasks = db.get_job_tasks(cl.get_id(), reverse=False)
+            for task in tasks:
+
+                if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL:
+                    if task.status != JobSchedule.STATUS_DONE:
+
+                        # get new task object because it could be changed from cancel task
+                        task = db.get_task_by_id(task.uuid)
+
+                        if task.canceled:
+                            task.function_result = "canceled"
+                            task.status = JobSchedule.STATUS_DONE
+                            task.write_to_db(db.kv_store)
+                            continue
+
+                        node = db.get_storage_node_by_id(task.node_id)
+
+                        if not node:
+                            task.function_result = "node not found"
+                            task.status = JobSchedule.STATUS_DONE
+                            task.write_to_db(db.kv_store)
+                            continue
+
+                        if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_ONLINE]:
+                            msg = f"Node is {node.status}, retry task"
+                            logger.info(msg)
+                            task.function_result = msg
+                            task.status = JobSchedule.STATUS_SUSPENDED
+                            task.write_to_db(db.kv_store)
+                            continue
+
+                        if task.status != JobSchedule.STATUS_RUNNING:
+                            task.status = JobSchedule.STATUS_RUNNING
+                            task.write_to_db(db.kv_store)
+
+                        lvol_bdev_name = task.function_params["lvol_bdev_name"]
+
+                        logger.info(f"Sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}")
+                        ret, err = node.rpc_client().delete_lvol(lvol_bdev_name, del_async=True)
+                        if not ret:
+                            if "code" in err and err["code"] == -19:
+                                logger.error(f"Sync delete completed with error: {err}")
+                            else:
+                                logger.error(
+                                    f"Failed to sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}")
+
+                        task.function_result = f"bdev {lvol_bdev_name} deleted"
+                        task.status = JobSchedule.STATUS_DONE
+                        task.write_to_db(db.kv_store)
+
+    time.sleep(3)
diff --git a/simplyblock_core/snode_client.py b/simplyblock_core/snode_client.py
index 2e8504b08..6f1bee0db 100644
--- a/simplyblock_core/snode_client.py
+++ b/simplyblock_core/snode_client.py
@@ -73,7 +73,7 @@ def _request(self, method, path, payload=None):
         return None, None
 
     def is_live(self):
-        return self._request("GET", "/check")
+        return self._request("GET", "check")
 
     def info(self):
         return self._request("GET", "info")
@@ -81,7 +81,7 @@ def info(self):
     def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None, cluster_ip=None,
                            fdb_connection=None, namespace=None, server_ip=None, rpc_port=None,
                            rpc_username=None, rpc_password=None, multi_threading_enabled=False, timeout=0, ssd_pcie=None,
-                           total_mem=None, system_mem=None, cluster_mode=None):
+                           total_mem=None, system_mem=None, cluster_mode=None, cluster_id=None):
         params = {
             "cluster_ip": cluster_ip,
             "server_ip": server_ip,
@@ -113,6 +113,8 @@ def spdk_process_start(self, l_cores, spdk_mem, spdk_image=None, spdk_debug=None
             params["system_mem"] = system_mem
         if cluster_mode:
             params["cluster_mode"] = cluster_mode
+        if cluster_id:
+            params["cluster_id"] = cluster_id
         return self._request("POST", "spdk_process_start", params)
 
     def join_swarm(self, cluster_ip, join_token, db_connection, cluster_id):
@@ -124,8 +126,8 @@ def join_swarm(self, cluster_ip, join_token, db_connection, cluster_id):
         #     "db_connection": db_connection}
         # return self._request("POST", "join_swarm", params)
 
-    def spdk_process_kill(self, rpc_port):
-        return self._request("GET", "spdk_process_kill", {"rpc_port": rpc_port})
+    def spdk_process_kill(self, rpc_port, cluster_id=None):
+        return self._request("GET", "spdk_process_kill", {"rpc_port": rpc_port, "cluster_id": cluster_id})
 
     def leave_swarm(self):
         return True
@@ -152,8 +154,8 @@ def bind_device_to_spdk(self, device_pci):
         params = {"device_pci": device_pci}
         return self._request("POST", "bind_device_to_spdk", params)
 
-    def spdk_process_is_up(self, rpc_port):
-        params = {"rpc_port": rpc_port}
+    def spdk_process_is_up(self, rpc_port, cluster_id):
+        params = {"rpc_port": rpc_port, "cluster_id": cluster_id}
         return self._request("GET", "spdk_process_is_up", params)
 
     def get_file_content(self, file_name):
diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py
index ae930b44f..ea9186960 100644
--- a/simplyblock_core/storage_node_ops.py
+++ b/simplyblock_core/storage_node_ops.py
@@ -33,6 +33,7 @@
 from simplyblock_core.models.snapshot import SnapShot
 from simplyblock_core.models.storage_node import StorageNode
 from simplyblock_core.models.cluster import Cluster
+from simplyblock_core.prom_client import PromClient
 from simplyblock_core.rpc_client import RPCClient, RPCException
 from simplyblock_core.snode_client import SNodeClient, SNodeClientException
 from simplyblock_web import node_utils
@@ -424,8 +425,8 @@ def _create_storage_device_stack(rpc_client, nvme, snode, after_restart):
     return nvme
 
 
-def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, jm_percent, partition_size=0):
-    nbd_device = rpc_client.nbd_start_disk(nvme.nvme_bdev)
+def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, jm_percent, partition_size, nbd_index):
+    nbd_device = rpc_client.nbd_start_disk(nvme.nvme_bdev, f"/dev/nbd{nbd_index}")
     time.sleep(3)
     if not nbd_device:
         logger.error("Failed to start nbd dev")
@@ -458,79 +459,84 @@ def _create_device_partitions(rpc_client, nvme, snode, num_partitions_per_dev, j
 
 def _prepare_cluster_devices_partitions(snode, devices):
     db_controller = DBController()
-    rpc_client = RPCClient(
-        snode.mgmt_ip, snode.rpc_port,
-        snode.rpc_username, snode.rpc_password)
-
     new_devices = []
-    jm_devices = []
-    dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
-    bdevs_names = [d['name'] for d in rpc_client.get_bdevs()]
+    devices_to_partition = []
+    thread_list = []
     for index, nvme in enumerate(devices):
         if nvme.status == "not_found":
             continue
-
         if nvme.status not in [NVMeDevice.STATUS_ONLINE, NVMeDevice.STATUS_NEW]:
             logger.debug(f"Device is skipped: {nvme.get_id()}, status: {nvme.status}")
             new_devices.append(nvme)
             continue
-
         if nvme.is_partition:
-            dev_part = f"{nvme.nvme_bdev[:-2]}p1"
-            if dev_part in bdevs_names:
-                if dev_part not in jm_devices:
-                    jm_devices.append(dev_part)
-
-            new_device = _create_storage_device_stack(rpc_client, nvme, snode, after_restart=False)
-            if not new_device:
-                logger.error("failed to create dev stack")
-                return False
-            new_devices.append(new_device)
-            if new_device.status == NVMeDevice.STATUS_ONLINE:
-                device_events.device_create(new_device)
-
+            t = threading.Thread(target=_create_storage_device_stack, args=(snode.rpc_client(), nvme, snode, False,))
+            thread_list.append(t)
+            new_devices.append(nvme)
+            t.start()
         else:
-            # look for partitions
-            partitioned_devices = _search_for_partitions(rpc_client, nvme)
-            logger.debug("partitioned_devices")
-            logger.debug(partitioned_devices)
-            if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
-                logger.info("Partitioned devices found")
-            else:
+            devices_to_partition.append(nvme)
+            partitioned_devices = _search_for_partitions(snode.rpc_client(), nvme)
+            if len(partitioned_devices) != (1 + snode.num_partitions_per_dev):
                 logger.info(f"Creating partitions for {nvme.nvme_bdev}")
-                _create_device_partitions(rpc_client, nvme, snode, snode.num_partitions_per_dev, snode.jm_percent,
-                                          snode.partition_size)
-                partitioned_devices = _search_for_partitions(rpc_client, nvme)
-                if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
-                    logger.info("Device partitions created")
-                else:
-                    logger.error("Failed to create partitions")
-                    return False
+                t = threading.Thread(
+                    target=_create_device_partitions,
+                    args=(snode.rpc_client(), nvme, snode, snode.num_partitions_per_dev,
+                          snode.jm_percent, snode.partition_size, index+1,))
+                thread_list.append(t)
+                t.start()
 
-            jm_devices.append(partitioned_devices.pop(0).nvme_bdev)
+    for thread in thread_list:
+        thread.join()
 
+    thread_list = []
+    for nvme in devices_to_partition:
+        partitioned_devices = _search_for_partitions(snode.rpc_client(), nvme)
+        if len(partitioned_devices) == (1 + snode.num_partitions_per_dev):
+            logger.info("Device partitions created")
+            # remove 1st partition for jm
+            partitioned_devices.pop(0)
             for dev in partitioned_devices:
-                ret = _create_storage_device_stack(rpc_client, dev, snode, after_restart=False)
-                if not ret:
-                    logger.error("failed to create dev stack")
-                    return False
-                if dev.status == NVMeDevice.STATUS_ONLINE:
-                    if dev.cluster_device_order < 0:
-                        dev.cluster_device_order = dev_order
-                        dev_order += 1
-                    device_events.device_create(dev)
+                t = threading.Thread(target=_create_storage_device_stack,
+                                     args=(snode.rpc_client(), dev, snode, False,))
+                thread_list.append(t)
                 new_devices.append(dev)
+                t.start()
+        else:
+            logger.error("Failed to create partitions")
+            return False
 
-    snode.nvme_devices = new_devices
+    for thread in thread_list:
+        thread.join()
+
+    # assign device order
+    dev_order = get_next_cluster_device_order(db_controller, snode.cluster_id)
+    for nvme in new_devices:
+        if nvme.status == NVMeDevice.STATUS_ONLINE:
+            if nvme.cluster_device_order < 0:
+                nvme.cluster_device_order = dev_order
+                dev_order += 1
+        device_events.device_create(nvme)
+
+    # create jm device
+    jm_devices = []
+    bdevs_names = [d['name'] for d in snode.rpc_client().get_bdevs()]
+    for nvme in new_devices:
+        if nvme.status == NVMeDevice.STATUS_ONLINE:
+            dev_part = f"{nvme.nvme_bdev[:-2]}p1"
+            if dev_part in bdevs_names:
+                if dev_part not in jm_devices:
+                    jm_devices.append(dev_part)
 
     if jm_devices:
-        jm_device = _create_jm_stack_on_raid(rpc_client, jm_devices, snode, after_restart=False)
+        jm_device = _create_jm_stack_on_raid(snode.rpc_client(), jm_devices, snode, after_restart=False)
         if not jm_device:
             logger.error("Failed to create JM device")
             return False
 
         snode.jm_device = jm_device
 
+    snode.nvme_devices = new_devices
     return True
 
 
@@ -712,6 +718,8 @@ def _connect_to_remote_devs(
         allowed_node_statuses.append(StorageNode.STATUS_RESTARTING)
         allowed_dev_statuses.append(NVMeDevice.STATUS_UNAVAILABLE)
 
+    devices_to_connect = []
+    connect_threads = []
     nodes = db_controller.get_storage_nodes_by_cluster_id(this_node.cluster_id)
     # connect to remote devs
     for node_index, node in enumerate(nodes):
@@ -726,12 +734,29 @@ def _connect_to_remote_devs(
 
             if not dev.alceml_bdev:
                 raise ValueError(f"device alceml bdev not found!, {dev.get_id()}")
+            devices_to_connect.append(dev)
+            t = threading.Thread(
+                target=connect_device,
+                args=(f"remote_{dev.alceml_bdev}", dev, this_node, node_bdev_names, reattach,))
+            connect_threads.append(t)
+            t.start()
 
-            dev.remote_bdev = connect_device(
-                    f"remote_{dev.alceml_bdev}", dev, this_node,
-                    bdev_names=node_bdev_names, reattach=reattach,
-            )
-            remote_devices.append(dev)
+    for t in connect_threads:
+        t.join()
+
+    node_bdevs = rpc_client.get_bdevs()
+    if node_bdevs:
+        node_bdev_names = [b['name'] for b in node_bdevs]
+
+    for dev in devices_to_connect:
+        for bdev in node_bdev_names:
+            if bdev.startswith(f"remote_{dev.alceml_bdev}"):
+                dev.remote_bdev = bdev
+                break
+        if not dev.remote_bdev:
+            logger.error(f"Failed to connect to remote device {dev.alceml_name}")
+            continue
+        remote_devices.append(dev)
 
     return remote_devices
 
@@ -1009,7 +1034,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list,
                 namespace, mgmt_ip, rpc_port, rpc_user, rpc_pass,
                 multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED,
                 timeout=constants.SPDK_PROXY_TIMEOUT,
-                ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode)
+                ssd_pcie=ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, cluster_id=cluster_id)
             time.sleep(5)
 
         except Exception as e:
@@ -1094,8 +1119,6 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list,
         snode.active_tcp=active_tcp
         snode.active_rdma=active_rdma
 
-        if 'cpu_count' in node_info:
-            snode.cpu = node_info['cpu_count']
         if 'cpu_hz' in node_info:
             snode.cpu_hz = node_info['cpu_hz']
         if 'memory' in node_info:
@@ -1103,6 +1126,7 @@ def add_node(cluster_id, node_addr, iface_name,data_nics_list,
         if 'hugepages' in node_info:
             snode.hugepages = node_info['hugepages']
 
+        snode.cpu = len(utils.hexa_to_cpu_list(spdk_cpu_mask))
         snode.l_cores = l_cores or ""
         snode.spdk_cpu_mask = spdk_cpu_mask or ""
         snode.spdk_mem = minimum_hp_memory
@@ -1465,7 +1489,7 @@ def remove_storage_node(node_id, force_remove=False, force_migrate=False):
         if health_controller._check_node_api(snode.mgmt_ip):
             logger.info("Stopping SPDK container")
             snode_api = SNodeClient(snode.api_endpoint, timeout=20)
-            snode_api.spdk_process_kill(snode.rpc_port)
+            snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id)
             snode_api.leave_swarm()
             pci_address = []
             for dev in snode.nvme_devices:
@@ -1687,7 +1711,7 @@ def restart_storage_node(
             snode.l_cores, snode.spdk_mem, snode.spdk_image, spdk_debug, cluster_ip, fdb_connection,
             snode.namespace, snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password,
             multi_threading_enabled=constants.SPDK_PROXY_MULTI_THREADING_ENABLED, timeout=constants.SPDK_PROXY_TIMEOUT,
-            ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode)
+            ssd_pcie=snode.ssd_pcie, total_mem=total_mem, system_mem=minimum_sys_memory, cluster_mode=cluster.mode, cluster_id=snode.cluster_id)
 
     except Exception as e:
         logger.error(e)
@@ -1894,7 +1918,6 @@ def restart_storage_node(
         return False
     if snode.enable_ha_jm:
         snode.remote_jm_devices = _connect_to_remote_jm_devs(snode)
-    snode.health_check = True
     snode.lvstore_status = ""
     snode.write_to_db(db_controller.kv_store)
 
@@ -1991,7 +2014,7 @@ def restart_storage_node(
                     logger.error('Failed to connect to remote devices')
                     return False
                 node.write_to_db(kv_store)
-                    
+
 
             logger.info("Sending device status event")
             snode = db_controller.get_storage_node_by_id(snode.get_id())
@@ -2149,21 +2172,6 @@ def list_storage_devices(node_id, is_json):
             "Health": snode.jm_device.health_check
         })
 
-    for jm_id in snode.jm_ids:
-        try:
-            jm_device = db_controller.get_jm_device_by_id(jm_id)
-        except KeyError:
-            continue
-
-        jm_devices.append({
-            "UUID": jm_device.uuid,
-            "Name": jm_device.device_name,
-            "Size": utils.humanbytes(jm_device.size),
-            "Status": jm_device.status,
-            "IO Err": jm_device.io_error,
-            "Health": jm_device.health_check
-        })
-
     for device in snode.remote_devices:
         logger.debug(device)
         logger.debug("*" * 20)
@@ -2262,7 +2270,7 @@ def shutdown_storage_node(node_id, force=False):
 
     logger.info("Stopping SPDK")
     try:
-        SNodeClient(snode.api_endpoint, timeout=10, retry=10).spdk_process_kill(snode.rpc_port)
+        SNodeClient(snode.api_endpoint, timeout=10, retry=10).spdk_process_kill(snode.rpc_port, snode.cluster_id)
     except SNodeClientException:
         logger.error('Failed to kill SPDK')
         return False
@@ -2477,20 +2485,11 @@ def resume_storage_node(node_id):
 def get_node_capacity(node_id, history, records_count=20, parse_sizes=True):
     db_controller = DBController()
     try:
-        this_node = db_controller.get_storage_node_by_id(node_id)
+        node = db_controller.get_storage_node_by_id(node_id)
     except KeyError:
         logger.error("Storage node Not found")
         return
 
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
-
-    records = db_controller.get_node_capacity(this_node, records_number)
     cap_stats_keys = [
         "date",
         "size_total",
@@ -2500,6 +2499,8 @@ def get_node_capacity(node_id, history, records_count=20, parse_sizes=True):
         "size_util",
         "size_prov_util",
     ]
+    prom_client = PromClient(node.cluster_id)
+    records = prom_client.get_node_metrics(node_id, cap_stats_keys, history)
     new_records = utils.process_records(records, records_count, keys=cap_stats_keys)
 
     if not parse_sizes:
@@ -2526,17 +2527,6 @@ def get_node_iostats_history(node_id, history, records_count=20, parse_sizes=Tru
     except KeyError:
         logger.error("node not found")
         return False
-
-    if history:
-        records_number = utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
-
-    records = db_controller.get_node_stats(node, records_number)
-
     io_stats_keys = [
         "date",
         "read_bytes",
@@ -2574,6 +2564,8 @@ def get_node_iostats_history(node_id, history, records_count=20, parse_sizes=Tru
                 "write_latency_ticks",
             ]
         )
+    prom_client = PromClient(node.cluster_id)
+    records = prom_client.get_node_metrics(node_id, io_stats_keys, history)
     # combine records
     new_records = utils.process_records(records, records_count, keys=io_stats_keys)
 
@@ -2987,7 +2979,6 @@ def set_node_status(node_id, status, reconnect_on_online=True):
             return False
         if snode.enable_ha_jm:
             snode.remote_jm_devices = _connect_to_remote_jm_devs(snode)
-        snode.health_check = True
         snode.write_to_db(db_controller.kv_store)
         distr_controller.send_cluster_map_to_node(snode)
 
@@ -3190,6 +3181,13 @@ def recreate_lvstore(snode, force=False):
             port_type = "tcp"
             if sec_node.active_rdma:
                 port_type = "udp"
+
+            ret = sec_node.wait_for_jm_rep_tasks_to_finish(snode.jm_vuid)
+            if not ret:
+                msg = f"JM replication task found for jm {snode.jm_vuid}"
+                logger.error(msg)
+                storage_events.jm_repl_tasks_found(sec_node, snode.jm_vuid)
+
             fw_api.firewall_set_port(snode.lvol_subsys_port, port_type, "block", sec_node.rpc_port)
             tcp_ports_events.port_deny(sec_node, snode.lvol_subsys_port)
 
@@ -3226,7 +3224,7 @@ def recreate_lvstore(snode, force=False):
     def _kill_app():
         storage_events.snode_restart_failed(snode)
         snode_api = SNodeClient(snode.api_endpoint, timeout=5, retry=5)
-        snode_api.spdk_process_kill(snode.rpc_port)
+        snode_api.spdk_process_kill(snode.rpc_port, snode.cluster_id)
         set_node_status(snode.get_id(), StorageNode.STATUS_OFFLINE)
 
     # If LVol Store recovery failed then stop spdk process
@@ -3612,10 +3610,20 @@ def create_lvstore(snode, ndcs, npcs, distr_bs, distr_chunk_bs, page_size_in_blo
 
         sec_node.write_to_db()
 
+    storage_events.node_ports_changed(snode)
     return True
 
 
 def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None):
+    def _create_distr(snode, name, params):
+        try:
+            rpc_client.bdev_distrib_create(**params)
+        except Exception:
+            logger.error("Failed to create bdev distrib")
+        ret = distr_controller.send_cluster_map_to_distr(snode, name)
+        if not ret:
+            logger.error("Failed to send cluster map")
+
     rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
     db_controller = DBController()
     cluster = db_controller.get_cluster_by_id(snode.cluster_id)
@@ -3632,11 +3640,11 @@ def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None):
     else:
         node_bdev_names = []
 
+    thread_list = []
     for bdev in stack:
         type = bdev['type']
         name = bdev['name']
         params = bdev['params']
-
         if name in node_bdev_names:
             continue
 
@@ -3652,23 +3660,21 @@ def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None):
                 snode.distrib_cpu_index = (snode.distrib_cpu_index + 1) % len(snode.distrib_cpu_cores)
 
             params['full_page_unmap'] = cluster.full_page_unmap
-            ret = rpc_client.bdev_distrib_create(**params)
-            if ret:
-                ret = distr_controller.send_cluster_map_to_distr(snode, name)
-                if not ret:
-                    return False, "Failed to send cluster map"
-                # time.sleep(1)
+            t = threading.Thread(target=_create_distr, args=(snode, name, params,))
+            thread_list.append(t)
+            t.start()
+            ret = True
 
         elif type == "bdev_lvstore" and lvstore_stack and not primary_node:
-            ret = rpc_client.create_lvstore(**params)
-            # if ret and snode.jm_vuid > 0:
-            #     rpc_client.bdev_lvol_set_lvs_ops(snode.lvstore, snode.jm_vuid, snode.lvol_subsys_port)
+                ret = rpc_client.create_lvstore(**params)
 
         elif type == "bdev_ptnonexcl":
             ret = rpc_client.bdev_PT_NoExcl_create(**params)
 
         elif type == "bdev_raid":
-
+            if thread_list:
+                for t in thread_list:
+                    t.join()
             distribs_list = bdev["distribs_list"]
             strip_size_kb = params["strip_size_kb"]
             ret = rpc_client.bdev_raid_create(name, distribs_list, strip_size_kb=strip_size_kb)
@@ -3686,6 +3692,9 @@ def _create_bdev_stack(snode, lvstore_stack=None, primary_node=None):
                 _remove_bdev_stack(created_bdevs[::-1], rpc_client)
             return False, f"Failed to create BDev: {name}"
 
+    if thread_list:
+        for t in thread_list:
+            t.join()
     return True, None
 
 
diff --git a/simplyblock_core/utils/__init__.py b/simplyblock_core/utils/__init__.py
index 941414708..1f086dc2d 100644
--- a/simplyblock_core/utils/__init__.py
+++ b/simplyblock_core/utils/__init__.py
@@ -12,8 +12,12 @@
 import time
 import socket
 from typing import Union, Any, Optional, Tuple
+from docker import DockerClient
 from kubernetes import client, config
-from kubernetes.client import ApiException
+from kubernetes.client import ApiException, V1Deployment, V1DeploymentSpec, V1ObjectMeta, \
+    V1PodTemplateSpec, V1PodSpec, V1Container, V1EnvVar, V1VolumeMount, V1Volume, V1ConfigMapVolumeSource, \
+    V1LabelSelector, V1ResourceRequirements
+
 import docker
 from prettytable import PrettyTable
 from docker.errors import APIError, DockerException, ImageNotFound, NotFound
@@ -725,7 +729,13 @@ def convert_size(size: Union[int, str], unit: str, round_up: bool = False) -> in
     raw = size / (base ** exponent)
     return math.ceil(raw) if round_up else int(raw)
 
-
+def first_six_chars(s: str) -> str:
+    """
+    Returns the first six characters of a given string.
+    If the string is shorter than six characters, returns the entire string.
+    """
+    return s[:6]
+    
 def nearest_upper_power_of_2(n):
     # Check if n is already a power of 2
     if (n & (n - 1)) == 0:
@@ -1093,7 +1103,7 @@ def addNvmeDevices(rpc_client, snode, devs):
             serial_number = nvme_driver_data['ctrlr_data']['serial_number']
             if snode.id_device_by_nqn:
                 if "ns_data" in nvme_driver_data:
-                    serial_number = nvme_driver_data['pci_address'] + nvme_driver_data['ns_data']['id']
+                    serial_number = nvme_driver_data['pci_address'] + str(nvme_driver_data['ns_data']['id'])
                 else:
                     logger.error(f"No subsystem nqn found for device: {nvme_driver_data['pci_address']}")
 
@@ -2031,17 +2041,135 @@ def patch_prometheus_configmap(username: str, password: str):
     load_kube_config_with_fallback()
     v1 = client.CoreV1Api()
 
-    cm = v1.read_namespaced_config_map(name="sbcli-simplyblock-prometheus-config", namespace=constants.K8S_NAMESPACE)
-    prometheus_yml = cm.data.get("prometheus.yml", "")
+    try:
+        cm = v1.read_namespaced_config_map(
+            name="sbcli-simplyblock-prometheus-config",
+            namespace=constants.K8S_NAMESPACE
+        )
+    except client.exceptions.ApiException as e:
+        logger.error(f"Failed to read ConfigMap: {e}")
+        return False
+
+    try:
+        prometheus_yml = cm.data.get("prometheus.yml", "")
+        if not prometheus_yml:
+            logger.error("prometheus.yml key not found in ConfigMap.")
+            return False
 
-    prometheus_yml = re.sub(r"username:*", f"username: '{username}'", prometheus_yml)
-    prometheus_yml = re.sub(r"password:*", f"password: '{password}'", prometheus_yml)
+        try:
+            prometheus_yml = re.sub(r"username:.*", f"username: '{username}'", prometheus_yml)
+            prometheus_yml = re.sub(r"password:.*", f"password: '{password}'", prometheus_yml)
+        except re.error as e:
+            logger.error(f"Regex error while patching Prometheus YAML: {e}")
+            return False
 
-    patch_body = {
-        "data": {
-            "prometheus.yml": prometheus_yml
+        patch_body = {
+            "data": {
+                "prometheus.yml": prometheus_yml
+            }
         }
-    }
 
-    v1.patch_namespaced_config_map(name="sbcli-simplyblock-prometheus-config", namespace=constants.K8S_NAMESPACE, body=patch_body)
-    logger.info("Patched sbcli-simplyblock-prometheus-config ConfigMap with new credentials.")
+        v1.patch_namespaced_config_map(
+            name="sbcli-simplyblock-prometheus-config",
+            namespace=constants.K8S_NAMESPACE,
+            body=patch_body
+        )
+
+        logger.info("Patched sbcli-simplyblock-prometheus-config ConfigMap with new credentials.")
+        return True
+
+    except client.exceptions.ApiException as e:
+        logger.error(f"Failed to patch ConfigMap: {e}")
+        return False
+
+    except Exception as e:
+        logger.error(f"Unexpected error while patching ConfigMap: {e}")
+        return False
+
+
+def create_docker_service(cluster_docker: DockerClient, service_name: str, service_file: str, service_image: str):
+    logger.info(f"Creating service: {service_name}")
+    cluster_docker.services.create(
+        image=service_image,
+        command=service_file,
+        name=service_name,
+        mounts=["/etc/foundationdb:/etc/foundationdb"],
+        env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"],
+        networks=["host"],
+        constraints=["node.role == manager"],
+        labels={
+            "com.docker.stack.image": service_image,
+            "com.docker.stack.namespace": "app"}
+    )
+
+def create_k8s_service(namespace: str, deployment_name: str,
+                       container_name: str, service_file: str, container_image: str):
+
+    logger.info(f"Creating deployment: {deployment_name} in namespace {namespace}")
+    load_kube_config_with_fallback()
+    apps_v1 = client.AppsV1Api()
+
+    env_list = [
+        V1EnvVar(
+            name="SIMPLYBLOCK_LOG_LEVEL",
+            value_from={"config_map_key_ref": {"name": "simplyblock-config", "key": "LOG_LEVEL"}}
+        )
+    ]
+
+    volume_mounts = [
+        V1VolumeMount(
+            name="fdb-cluster-file",
+            mount_path="/etc/foundationdb/fdb.cluster",
+            sub_path="fdb.cluster"
+        )
+    ]
+
+    volumes = [
+        V1Volume(
+            name="fdb-cluster-file",
+            config_map=V1ConfigMapVolumeSource(
+                name="simplyblock-fdb-cluster-config",
+                items=[{"key": "cluster-file", "path": "fdb.cluster"}]
+            )
+        )
+    ]
+
+    container = V1Container(
+        name=container_name,
+        image=container_image,
+        command=["python", service_file], 
+        env=env_list,
+        volume_mounts=volume_mounts,
+        resources=V1ResourceRequirements(
+            requests={"cpu": "200m", "memory": "256Mi"},
+            limits={"cpu": "400m", "memory": "1Gi"}
+        )
+    )
+
+    pod_spec = V1PodSpec(
+        containers=[container],
+        volumes=volumes,
+        host_network=True,
+        dns_policy="ClusterFirstWithHostNet"
+    )
+
+    pod_template = V1PodTemplateSpec(
+        metadata=V1ObjectMeta(labels={"app": deployment_name}),
+        spec=pod_spec
+    )
+
+    deployment_spec = V1DeploymentSpec(
+        replicas=1,
+        selector=V1LabelSelector(match_labels={"app": deployment_name}),
+        template=pod_template
+    )
+
+    deployment = V1Deployment(
+        api_version="apps/v1",
+        kind="Deployment",
+        metadata=V1ObjectMeta(name=deployment_name, namespace=namespace),
+        spec=deployment_spec
+    )
+
+    apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment)
+    logger.info(f"Deployment {deployment_name} created successfully.")
diff --git a/simplyblock_web/api/internal/storage_node/docker.py b/simplyblock_web/api/internal/storage_node/docker.py
index 8e18fc276..d1ee4f9f0 100644
--- a/simplyblock_web/api/internal/storage_node/docker.py
+++ b/simplyblock_web/api/internal/storage_node/docker.py
@@ -142,6 +142,7 @@ class SPDKParams(BaseModel):
     spdk_image: Optional[str] = Field(constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE)
     cluster_ip: Optional[str] = Field(default=None, pattern=utils.IP_PATTERN)
     cluster_mode: str
+    cluster_id: str
 
 
 @api.post('/spdk_process_start', responses={
diff --git a/simplyblock_web/api/internal/storage_node/kubernetes.py b/simplyblock_web/api/internal/storage_node/kubernetes.py
index be3193138..d5e98eb1d 100644
--- a/simplyblock_web/api/internal/storage_node/kubernetes.py
+++ b/simplyblock_web/api/internal/storage_node/kubernetes.py
@@ -268,6 +268,7 @@ class SPDKParams(BaseModel):
     spdk_image: str = Field(constants.SIMPLY_BLOCK_SPDK_ULTRA_IMAGE)
     cluster_ip: str = Field(pattern=utils.IP_PATTERN)
     cluster_mode: str
+    cluster_id: str
 
 
 @api.post('/spdk_process_start', responses={
@@ -286,9 +287,10 @@ def spdk_process_start(body: SPDKParams):
 
     total_mem_mib = core_utils.convert_size(core_utils.parse_size(body.total_mem), 'MB') if body.total_mem else ""
 
-    if _is_pod_up(body.rpc_port) or _is_pod_present(body.rpc_port):
+    first_six_cluster_id = core_utils.first_six_chars(body.cluster_id)
+    if _is_pod_up(body.rpc_port, first_six_cluster_id) or _is_pod_present(body.rpc_port, first_six_cluster_id):
         logger.info("SPDK pod found, removing...")
-        query = utils.RPCPortParams(rpc_port=body.rpc_port)
+        query = utils.RPCPortParams(rpc_port=body.rpc_port, cluster_id=body.cluster_id)
         spdk_process_kill(query)
 
     node_prepration_job_name = "snode-spdk-job-"
@@ -351,6 +353,7 @@ def spdk_process_start(body: SPDKParams):
             'SIMPLYBLOCK_DOCKER_IMAGE': constants.SIMPLY_BLOCK_DOCKER_IMAGE,
             'GRAYLOG_SERVER_IP': body.cluster_ip,
             'MODE': body.cluster_mode,
+            'CLUSTER_ID': first_six_cluster_id,
             'SSD_PCIE': ssd_pcie_params,
             'PCI_ALLOWED': ssd_pcie_list,
             'TOTAL_HP': total_mem_mib
@@ -420,9 +423,35 @@ def spdk_process_start(body: SPDKParams):
             logger.info(f"Job deleted: '{core_resp.metadata.name}' in namespace '{namespace}")
 
         elif core_isolate and openshift:
+            batch_v1 = core_utils.get_k8s_batch_client()
+            try:
+                batch_v1.read_namespaced_job(
+                    name=node_prepration_core_name,
+                    namespace=namespace
+                )
+                logger.info(f"Existing Job '{node_prepration_core_name}' found — deleting it first...")
+
+                batch_v1.delete_namespaced_job(
+                    name=node_prepration_core_name,
+                    namespace=namespace,
+                    body=V1DeleteOptions(
+                        propagation_policy='Foreground',
+                        grace_period_seconds=0
+                    )
+                )
+
+                node_utils_k8s.wait_for_job_deletion(node_prepration_core_name, namespace)
+
+                logger.info(f"Old Job '{node_prepration_core_name}' fully deleted.")
+
+            except ApiException as e:
+                if e.status == 404:
+                    logger.info(f"No pre-existing Job '{node_prepration_core_name}' found. Proceeding.")
+                else:
+                    raise
+                
             core_template = env.get_template('oc_storage_core_isolation.yaml.j2')
             core_yaml = yaml.safe_load(core_template.render(values))
-            batch_v1 = core_utils.get_k8s_batch_client()
             core_resp = batch_v1.create_namespaced_job(namespace=namespace, body=core_yaml)
             msg = f"Job created: '{core_resp.metadata.name}' in namespace '{namespace}"
             logger.info(msg)
@@ -463,7 +492,11 @@ def spdk_process_kill(query: utils.RPCPortParams):
     k8s_core_v1 = core_utils.get_k8s_core_client()
     try:
         namespace = node_utils_k8s.get_namespace()
-        pod_name = f"snode-spdk-pod-{query.rpc_port}"
+        if not query.cluster_id:
+            return utils.get_response(False, "param required: cluster_id")
+
+        first_six_cluster_id = core_utils.first_six_chars(query.cluster_id)
+        pod_name = f"snode-spdk-pod-{query.rpc_port}-{first_six_cluster_id}"
         resp = k8s_core_v1.delete_namespaced_pod(pod_name, namespace)
         retries = 10
         while retries > 0:
@@ -486,9 +519,9 @@ def spdk_process_kill(query: utils.RPCPortParams):
     return utils.get_response(True)
 
 
-def _is_pod_up(rpc_port):
+def _is_pod_up(rpc_port, cluster_id):
     k8s_core_v1 = core_utils.get_k8s_core_client()
-    pod_name = f"snode-spdk-pod-{rpc_port}"
+    pod_name = f"snode-spdk-pod-{rpc_port}-{cluster_id}"
     try:
         resp = k8s_core_v1.list_namespaced_pod(node_utils_k8s.get_namespace())
         for pod in resp.items:
@@ -502,9 +535,9 @@ def _is_pod_up(rpc_port):
         return False
     return False
 
-def _is_pod_present(rpc_port):
+def _is_pod_present(rpc_port, cluster_id):
     k8s_core_v1 = core_utils.get_k8s_core_client()
-    pod_name = f"snode-spdk-pod-{rpc_port}"
+    pod_name = f"snode-spdk-pod-{rpc_port}-{cluster_id}"
     try:
         resp = k8s_core_v1.list_namespaced_pod(node_utils_k8s.get_namespace())
         for pod in resp.items:
@@ -525,7 +558,11 @@ def _is_pod_present(rpc_port):
     })}}},
 })
 def spdk_process_is_up(query: utils.RPCPortParams):
-    if _is_pod_up(query.rpc_port):
+    if not query.cluster_id:
+        return utils.get_response(False, "param required: cluster_id")
+
+    first_six_cluster_id = core_utils.first_six_chars(query.cluster_id)
+    if _is_pod_up(query.rpc_port, first_six_cluster_id):
         return utils.get_response(True)
     else:
         return utils.get_response(False, "SPDK container is not running")
diff --git a/simplyblock_web/api/v1/__init__.py b/simplyblock_web/api/v1/__init__.py
index 4bcc5ba41..084a737cc 100644
--- a/simplyblock_web/api/v1/__init__.py
+++ b/simplyblock_web/api/v1/__init__.py
@@ -1,9 +1,12 @@
 import logging
+import os
 
+from flask import jsonify
 from flask import Flask
 
 from simplyblock_web.auth_middleware import token_required
 from simplyblock_web import utils
+from simplyblock_core import constants
 
 from . import cluster
 from . import mgmt_node
@@ -39,3 +42,31 @@ def before_request():
 @api.route('/', methods=['GET'])
 def status():
     return utils.get_response("Live")
+
+@api.route('/health/fdb', methods=['GET'])
+def health_fdb():
+    fdb_cluster_file = constants.KVD_DB_FILE_PATH
+
+    if not os.path.exists(fdb_cluster_file):
+        return jsonify({
+            "fdb_connected": False,
+            "message": "FDB cluster file not found"
+        }), 503
+
+    try:
+        with open(fdb_cluster_file, 'r') as f:
+            cluster_data = f.read().strip()
+            if not cluster_data:
+                return jsonify({
+                    "fdb_connected": False,
+                    "message": "FDB cluster file is empty"
+                }), 503
+    except Exception as e:
+        return jsonify({
+            "fdb_connected": False,
+            "message": f"Failed to read FDB cluster file: {str(e)}"
+        }), 503
+
+    return jsonify({
+        "fdb_connected": True,
+    }), 200
diff --git a/simplyblock_web/api/v1/cluster.py b/simplyblock_web/api/v1/cluster.py
index 698d9582d..2447cf958 100644
--- a/simplyblock_web/api/v1/cluster.py
+++ b/simplyblock_web/api/v1/cluster.py
@@ -60,6 +60,54 @@ def add_cluster():
     ))
 
 
+@bp.route('/cluster/create_first', methods=['POST'])
+def create_first_cluster():
+    cl_data = request.get_json()
+
+    if db.get_clusters():
+        return utils.get_response_error("Cluster found!", 400)
+
+    blk_size = 512
+    if 'blk_size' in cl_data:
+        if cl_data['blk_size'] not in [512, 4096]:
+            return utils.get_response_error("blk_size can be 512 or 4096", 400)
+        else:
+            blk_size = cl_data['blk_size']
+    page_size_in_blocks = cl_data.get('page_size_in_blocks', 2097152)
+    distr_ndcs = cl_data.get('distr_ndcs', 1)
+    distr_npcs = cl_data.get('distr_npcs', 1)
+    distr_bs = cl_data.get('distr_bs', 4096)
+    distr_chunk_bs = cl_data.get('distr_chunk_bs', 4096)
+    ha_type = cl_data.get('ha_type', 'single')
+    enable_node_affinity = cl_data.get('enable_node_affinity', False)
+    qpair_count = cl_data.get('qpair_count', 256)
+    name = cl_data.get('name', None)
+    fabric = cl_data.get('fabric', "tcp")
+    cap_warn = cl_data.get('cap_warn', 0)
+    cap_crit = cl_data.get('cap_crit', 0)
+    prov_cap_warn = cl_data.get('prov_cap_warn', 0)
+    prov_cap_crit = cl_data.get('prov_cap_crit', 0)
+    max_queue_size = cl_data.get('max_queue_size', 128)
+    inflight_io_threshold = cl_data.get('inflight_io_threshold', 4)
+    strict_node_anti_affinity = cl_data.get('strict_node_anti_affinity', False)
+    is_single_node = cl_data.get('is_single_node', False)
+    cluster_ip = cl_data.get('cluster_ip', None)
+    grafana_secret = cl_data.get('grafana_secret', None)
+
+    try:
+        cluster_id = cluster_ops.add_cluster(
+            blk_size, page_size_in_blocks, cap_warn, cap_crit, prov_cap_warn, prov_cap_crit,
+            distr_ndcs, distr_npcs, distr_bs, distr_chunk_bs, ha_type, enable_node_affinity,
+            qpair_count, max_queue_size, inflight_io_threshold, strict_node_anti_affinity, is_single_node, name, fabric,
+            cluster_ip=cluster_ip, grafana_secret=grafana_secret)
+        if cluster_id:
+            return utils.get_response(db.get_cluster_by_id(cluster_id).to_dict())
+        else:
+            return utils.get_response(False, "Failed to create cluster", 400)
+    except Exception as e:
+        return utils.get_response(False, str(e), 404)
+
+
 @bp.route('/cluster', methods=['GET'], defaults={'uuid': None})
 @bp.route('/cluster/<string:uuid>', methods=['GET'])
 def list_clusters(uuid):
diff --git a/simplyblock_web/api/v1/pool.py b/simplyblock_web/api/v1/pool.py
index a24a9e9b7..3b4fe5f72 100644
--- a/simplyblock_web/api/v1/pool.py
+++ b/simplyblock_web/api/v1/pool.py
@@ -184,21 +184,10 @@ def pool_iostats(uuid, history):
     except KeyError:
         return utils.get_response_error(f"Pool not found: {uuid}", 404)
 
-    if history:
-        records_number = core_utils.parse_history_param(history)
-        if not records_number:
-            logger.error(f"Error parsing history string: {history}")
-            return False
-    else:
-        records_number = 20
-
-    out = db.get_pool_stats(pool, records_number)
-    records_count = 20
-    new_records = core_utils.process_records(out, records_count)
-
+    data = pool_controller.get_io_stats(uuid, history)
     ret = {
         "object_data": pool.get_clean_dict(),
-        "stats": new_records or []
+        "stats": data or []
     }
     return utils.get_response(ret)
 
@@ -207,21 +196,13 @@ def pool_iostats(uuid, history):
 @bp.route('/pool/iostats-all-lvols/<string:pool_uuid>', methods=['GET'])
 def lvol_iostats(pool_uuid):
     try:
-        db.get_pool_by_id(pool_uuid)
+        pool = db.get_pool_by_id(pool_uuid)
     except KeyError:
         return utils.get_response_error(f"Pool not found: {pool_uuid}", 404)
 
-    ret = []
-    for lvol in db.get_lvols_by_pool_id(pool_uuid):
-
-        records_list = db.get_lvol_stats(lvol, limit=1)
-
-        if records_list:
-            data = records_list[0].get_clean_dict()
-        else:
-            data = {}
-        ret.append({
-            "object_data": lvol.get_clean_dict(),
-            "stats": data
-        })
+    data = pool_controller.get_capacity(pool_uuid)
+    ret = {
+        "object_data": pool.get_clean_dict(),
+        "stats": data or []
+    }
     return utils.get_response(ret)
diff --git a/simplyblock_web/api/v2/__init__.py b/simplyblock_web/api/v2/__init__.py
index ff8511e1c..c3723cce6 100644
--- a/simplyblock_web/api/v2/__init__.py
+++ b/simplyblock_web/api/v2/__init__.py
@@ -10,6 +10,7 @@
 from . import pool
 from . import snapshot
 from . import storage_node
+from . import task
 
 from simplyblock_core.db_controller import DBController
 
@@ -37,7 +38,7 @@ def _verify_api_token(
 storage_node.api.include_router(storage_node.instance_api)
 
 cluster.instance_api.include_router(storage_node.api)
-
+cluster.instance_api.include_router(task.api)
 
 volume.api.include_router(volume.instance_api)
 pool.instance_api.include_router(volume.api)
diff --git a/simplyblock_web/api/v2/cluster.py b/simplyblock_web/api/v2/cluster.py
index 422766246..49f8a09e8 100644
--- a/simplyblock_web/api/v2/cluster.py
+++ b/simplyblock_web/api/v2/cluster.py
@@ -24,7 +24,7 @@ class _UpdateParams(BaseModel):
 
 
 class ClusterParams(BaseModel):
-    name: Optional[str] = None
+    name: str = ""
     blk_size: Literal[512, 4096] = 512
     page_size_in_blocks: int = Field(2097152, gt=0)
     cap_warn: util.Percent = 0
@@ -41,16 +41,21 @@ class ClusterParams(BaseModel):
     inflight_io_threshold: int = 4
     enable_node_affinity: bool = False
     strict_node_anti_affinity: bool = False
-
+    is_single_node: bool = False
+    fabric: str = "tcp"
+    cluster_ip: str = ""
+    grafana_secret: str = ""
 
 @api.get('/', name='clusters:list')
 def list() -> List[ClusterDTO]:
-    return [
-        ClusterDTO.from_model(cluster)
-        for cluster
-        in db.get_clusters()
-    ]
-
+    data = []
+    for cluster in db.get_clusters():
+        stat_obj = None
+        ret = db.get_cluster_capacity(cluster, 1)
+        if ret:
+            stat_obj = ret[0]
+        data.append(ClusterDTO.from_model(cluster, stat_obj))
+    return data
 
 @api.post('/', name='clusters:create', status_code=201, responses={201: {"content": None}})
 def add(request: Request, parameters: ClusterParams):
@@ -58,8 +63,8 @@ def add(request: Request, parameters: ClusterParams):
     if not cluster_id_or_false:
         raise ValueError('Failed to create cluster')
 
-    entity_url = request.app.url_path_for('get', cluster_id=cluster_id_or_false)
-    return Response(status_code=201, headers={'Location': entity_url})
+    cluster = db.get_cluster_by_id(cluster_id_or_false)
+    return ClusterDTO.from_model(cluster)
 
 
 instance_api = APIRouter(prefix='/{cluster_id}')
@@ -77,7 +82,11 @@ def _lookup_cluster(cluster_id: UUID):
 
 @instance_api.get('/', name='clusters:detail')
 def get(cluster: Cluster) -> ClusterDTO:
-    return ClusterDTO.from_model(cluster)
+    stat_obj = None
+    ret = db.get_cluster_capacity(cluster, 1)
+    if ret:
+        stat_obj = ret[0]
+    return ClusterDTO.from_model(cluster, stat_obj)
 
 
 class UpdatableClusterParameters(BaseModel):
diff --git a/simplyblock_web/api/v2/device.py b/simplyblock_web/api/v2/device.py
index 1c7b40d7e..4fa0949fb 100644
--- a/simplyblock_web/api/v2/device.py
+++ b/simplyblock_web/api/v2/device.py
@@ -18,10 +18,14 @@
 
 @api.get('/', name='clusters:storage_nodes:devices:list')
 def list(cluster: Cluster, storage_node: StorageNode) -> List[DeviceDTO]:
-    return [
-        DeviceDTO.from_model(device)
-        for device in storage_node.nvme_devices
-    ]
+    data = []
+    for device in storage_node.nvme_devices:
+        stat_obj = None
+        ret = db.get_device_stats(device, 1)
+        if ret:
+            stat_obj = ret[0]
+        data.append(DeviceDTO.from_model(device, stat_obj))
+    return data
 
 instance_api = APIRouter(prefix='/{device_id}')
 
@@ -38,7 +42,11 @@ def _lookup_device(storage_node: StorageNode, device_id: UUID) -> NVMeDevice:
 
 @instance_api.get('/', name='clusters:storage_nodes:devices:detail')
 def get(cluster: Cluster, storage_node: StorageNode, device: Device) -> DeviceDTO:
-    return DeviceDTO.from_model(device)
+    stat_obj = None
+    ret = db.get_device_stats(device, 1)
+    if ret:
+        stat_obj = ret[0]
+    return DeviceDTO.from_model(device, stat_obj)
 
 
 @instance_api.delete('/', name='clusters:storage_nodes:devices:delete', status_code=204, responses={204: {"content": None}})
diff --git a/simplyblock_web/api/v2/dtos.py b/simplyblock_web/api/v2/dtos.py
index 54c1b5b01..62f1a94e1 100644
--- a/simplyblock_web/api/v2/dtos.py
+++ b/simplyblock_web/api/v2/dtos.py
@@ -12,11 +12,33 @@
 from simplyblock_core.models.nvme_device import NVMeDevice
 from simplyblock_core.models.pool import Pool
 from simplyblock_core.models.snapshot import SnapShot
+from simplyblock_core.models.stats import StatsObject
 from simplyblock_core.models.storage_node import StorageNode
 
 from . import util
 
 
+class CapacityStatDTO(BaseModel):
+    date: int
+    size_total: int
+    size_prov: int
+    size_used: int
+    size_free: int
+    size_util: int
+
+    @staticmethod
+    def from_model(model: StatsObject):
+        return CapacityStatDTO(
+            date=model.date,
+            size_total=model.size_total,
+            size_prov=model.size_prov,
+            size_used=model.size_used,
+            size_free=model.size_free,
+            size_util=model.size_util,
+        )
+
+
+
 class ClusterDTO(BaseModel):
     id: UUID
     name: Optional[str]
@@ -33,9 +55,10 @@ class ClusterDTO(BaseModel):
     node_affinity: bool
     anti_affinity: bool
     secret: str
+    capacity: CapacityStatDTO
 
     @staticmethod
-    def from_model(model: Cluster):
+    def from_model(model: Cluster, stat_obj: Optional[StatsObject]=None):
         return ClusterDTO(
             id=UUID(model.get_id()),
             name=model.cluster_name,
@@ -52,6 +75,7 @@ def from_model(model: Cluster):
             node_affinity=model.enable_node_affinity,
             anti_affinity=model.strict_node_anti_affinity,
             secret=model.secret,
+            capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()),
         )
 
 
@@ -65,9 +89,10 @@ class DeviceDTO(BaseModel):
     nvmf_ips: List[IPv4Address]
     nvmf_nqn: str = ""
     nvmf_port: int = 0
+    capacity: CapacityStatDTO
 
     @staticmethod
-    def from_model(model: NVMeDevice):
+    def from_model(model: NVMeDevice, stat_obj: Optional[StatsObject]=None):
         return DeviceDTO(
             id=UUID(model.get_id()),
             status=model.status,
@@ -78,6 +103,7 @@ def from_model(model: NVMeDevice):
             nvmf_ips=[IPv4Address(ip) for ip in model.nvmf_ip.split(',')],
             nvmf_nqn=model.nvmf_nqn,
             nvmf_port=model.nvmf_port,
+            capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()),
         )
 
 
@@ -107,9 +133,10 @@ class StoragePoolDTO(BaseModel):
     max_rw_mbytes: util.Unsigned
     max_r_mbytes: util.Unsigned
     max_w_mbytes: util.Unsigned
+    capacity: CapacityStatDTO
 
     @staticmethod
-    def from_model(model: Pool):
+    def from_model(model: Pool, stat_obj: Optional[StatsObject]=None):
         return StoragePoolDTO(
             id=UUID(model.get_id()),
             name=model.pool_name,
@@ -120,6 +147,7 @@ def from_model(model: Pool):
             max_rw_mbytes=model.max_rw_mbytes_per_sec,
             max_r_mbytes=model.max_r_mbytes_per_sec,
             max_w_mbytes=model.max_w_mbytes_per_sec,
+            capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()),
         )
 
 
@@ -151,16 +179,22 @@ def from_model(model: SnapShot, request: Request, cluster_id, pool_id, volume_id
 
 
 class StorageNodeDTO(BaseModel):
-    id: UUID
+    uuid: UUID
     status: str
-    ip: IPv4Address
+    mgmt_ip: IPv4Address
+    health_check: bool
+    online_devices: str
+    capacity: CapacityStatDTO
 
     @staticmethod
-    def from_model(model: StorageNode):
+    def from_model(model: StorageNode, stat_obj: Optional[StatsObject]=None):
         return StorageNodeDTO(
-            id=UUID(model.get_id()),
+            uuid=UUID(model.get_id()),
             status=model.status,
-            ip=IPv4Address(model.mgmt_ip),
+            mgmt_ip=IPv4Address(model.mgmt_ip),
+            health_check=model.health_check,
+            online_devices=f"{len(model.nvme_devices)}/{len([d for d in model.nvme_devices if d.status=='online'])}",
+            capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()),
         )
 
 
@@ -204,9 +238,10 @@ class VolumeDTO(BaseModel):
     max_rw_mbytes: util.Unsigned
     max_r_mbytes: util.Unsigned
     max_w_mbytes: util.Unsigned
+    capacity: CapacityStatDTO
 
     @staticmethod
-    def from_model(model: LVol, request: Request, cluster_id: str):
+    def from_model(model: LVol, request: Request, cluster_id: str, stat_obj: Optional[StatsObject]=None):
         return VolumeDTO(
             id=UUID(model.get_id()),
             name=model.lvol_name,
@@ -239,4 +274,5 @@ def from_model(model: LVol, request: Request, cluster_id: str):
             max_rw_mbytes=model.rw_mbytes_per_sec,
             max_r_mbytes=model.r_mbytes_per_sec,
             max_w_mbytes=model.w_mbytes_per_sec,
+            capacity=CapacityStatDTO.from_model(stat_obj if stat_obj else StatsObject()),
         )
diff --git a/simplyblock_web/api/v2/pool.py b/simplyblock_web/api/v2/pool.py
index c779f70ca..4ccae01ab 100644
--- a/simplyblock_web/api/v2/pool.py
+++ b/simplyblock_web/api/v2/pool.py
@@ -20,12 +20,15 @@
 
 @api.get('/', name='clusters:storage-pools:list')
 def list(cluster: Cluster) -> List[StoragePoolDTO]:
-    return [
-        StoragePoolDTO.from_model(pool)
-        for pool
-        in db.get_pools()
-        if pool.cluster_id == cluster.get_id()
-    ]
+    data = []
+    for pool in db.get_pools():
+        if pool.cluster_id == cluster.get_id():
+            stat_obj = None
+            ret = db.get_pool_stats(pool, 1)
+            if ret:
+                stat_obj = ret[0]
+            data.append(StoragePoolDTO.from_model(pool, stat_obj))
+    return data
 
 
 class StoragePoolParams(BaseModel):
@@ -54,9 +57,8 @@ def add(request: Request, cluster: Cluster, parameters: StoragePoolParams) -> Re
 
     if not id_or_false:
         raise ValueError('Failed to create pool')
-
-    entity_url = request.app.url_path_for('clusters:storage-pools:detail', cluster_id=cluster.get_id(), pool_id=id_or_false)
-    return Response(status_code=201, headers={'Location': entity_url})
+    pool = db.get_pool_by_id(id_or_false)
+    return pool.to_dict()
 
 
 instance_api = APIRouter(prefix='/{pool_id}')
@@ -74,7 +76,11 @@ def _lookup_storage_pool(pool_id: UUID) -> PoolModel:
 
 @instance_api.get('/', name='clusters:storage-pools:detail')
 def get(cluster: Cluster, pool: StoragePool) -> StoragePoolDTO:
-    return StoragePoolDTO.from_model(pool)
+    stat_obj = None
+    ret = db.get_pool_stats(pool, 1)
+    if ret:
+        stat_obj = ret[0]
+    return StoragePoolDTO.from_model(pool, stat_obj)
 
 
 @instance_api.delete('/', name='clusters:storage-pools:delete', status_code=204, responses={204: {"content": None}})
@@ -122,5 +128,5 @@ def update(cluster: Cluster, pool: StoragePool, parameters: UpdatableStoragePool
 
 @instance_api.get('/iostats', name='clusters:storage-pools:iostats')
 def iostats(cluster: Cluster, pool: StoragePool, limit: int = 20):
-    records = db.get_pool_stats(pool, limit)
-    return core_utils.process_records(records, 20)
+    data = pool_controller.get_io_stats(pool.get_id(), history="")
+    return core_utils.process_records(data, 20)
diff --git a/simplyblock_web/api/v2/storage_node.py b/simplyblock_web/api/v2/storage_node.py
index f93fa5250..aa7923d36 100644
--- a/simplyblock_web/api/v2/storage_node.py
+++ b/simplyblock_web/api/v2/storage_node.py
@@ -22,11 +22,14 @@
 
 @api.get('/', name='clusters:storage-nodes:list')
 def list(cluster: Cluster) -> List[StorageNodeDTO]:
-    return [
-        StorageNodeDTO.from_model(storage_node)
-        for storage_node
-        in db.get_storage_nodes_by_cluster_id(cluster.get_id())
-    ]
+    data = []
+    for storage_node in db.get_storage_nodes_by_cluster_id(cluster.get_id()):
+        node_stat_obj = None
+        ret = db.get_node_capacity(storage_node, 1)
+        if ret:
+            node_stat_obj = ret[0]
+        data.append(StorageNodeDTO.from_model(storage_node, node_stat_obj))
+    return data
 
 
 class StorageNodeParams(BaseModel):
@@ -35,9 +38,8 @@ class StorageNodeParams(BaseModel):
     max_snapshots: int = Field(500)
     ha_jm: bool = Field(True)
     test_device: bool = Field(False)
-    spdk_image: Optional[str]
+    spdk_image: Optional[str] = Field("")
     spdk_debug: bool = Field(False)
-    full_page_unmap: bool = Field(False)
     data_nics: List[str] = Field([])
     namespace: str = Field('default')
     jm_percent: util.Percent = Field(3)
@@ -47,7 +49,7 @@ class StorageNodeParams(BaseModel):
 
 
 @api.post('/', name='clusters:storage-nodes:create', status_code=201, responses={201: {"content": None}})
-def add(request: Request, cluster: Cluster, parameters: StorageNodeParams) -> Response:
+def add(request: Request, cluster: Cluster, parameters: StorageNodeParams):
     task_id_or_false = tasks_controller.add_node_add_task(
         cluster.get_id(),
         {
@@ -65,14 +67,11 @@ def add(request: Request, cluster: Cluster, parameters: StorageNodeParams) -> Re
             'enable_test_device': parameters.test_device,
             'namespace': parameters.namespace,
             'enable_ha_jm': parameters.ha_jm,
-            'full_page_unmap': parameters.full_page_unmap,
         }
     )
     if not task_id_or_false:
         raise ValueError('Failed to create add-node task')
-
-    task_url = request.app.url_path_for('clusters:storage-nodes:detail', cluster_id=cluster.get_id(), task_id=task_id_or_false)
-    return Response(status_code=201, headers={'Location': task_url})
+    return task_id_or_false
 
 
 instance_api = APIRouter(prefix='/{storage_node_id}')
@@ -90,7 +89,11 @@ def _lookup_storage_node(storage_node_id: UUID) -> StorageNodeModel:
 
 @instance_api.get('/', name='clusters:storage-nodes:detail')
 def get(cluster: Cluster, storage_node: StorageNode):
-    return StorageNodeDTO.from_model(storage_node)
+    node_stat_obj = None
+    ret = db.get_node_capacity(storage_node, 1)
+    if ret:
+        node_stat_obj = ret[0]
+    return StorageNodeDTO.from_model(storage_node, node_stat_obj)
 
 
 @instance_api.delete('/', name='clusters:storage-nodes:delete')
diff --git a/simplyblock_web/api/v2/task.py b/simplyblock_web/api/v2/task.py
index c17bec3b7..83890640f 100644
--- a/simplyblock_web/api/v2/task.py
+++ b/simplyblock_web/api/v2/task.py
@@ -40,3 +40,5 @@ def _lookup_task(task_id: UUID) -> JobSchedule:
 @instance_api.get('/', name='clusters:tasks:detail')
 def get(cluster: Cluster, task: Task) -> TaskDTO:
     return TaskDTO.from_model(task)
+
+api.include_router(instance_api)
diff --git a/simplyblock_web/api/v2/volume.py b/simplyblock_web/api/v2/volume.py
index 698788718..6755a1149 100644
--- a/simplyblock_web/api/v2/volume.py
+++ b/simplyblock_web/api/v2/volume.py
@@ -21,11 +21,14 @@
 
 @api.get('/', name='clusters:storage-pools:volumes:list')
 def list(request: Request, cluster: Cluster, pool: StoragePool) -> List[VolumeDTO]:
-    return [
-        VolumeDTO.from_model(lvol, request, cluster.get_id())
-        for lvol
-        in db.get_lvols_by_pool_id(pool.get_id())
-    ]
+    data = []
+    for lvol in db.get_lvols_by_pool_id(pool.get_id()):
+        stat_obj = None
+        ret = db.get_lvol_stats(lvol, 1)
+        if ret:
+            stat_obj = ret[0]
+        data.append(VolumeDTO.from_model(lvol, request, cluster.get_id(), stat_obj))
+    return data
 
 
 class _CreateParams(BaseModel):
@@ -122,7 +125,11 @@ def _lookup_volume(volume_id: UUID) -> LVol:
 
 @instance_api.get('/', name='clusters:storage-pools:volumes:detail')
 def get(request: Request, cluster: Cluster, pool: StoragePool, volume: Volume) -> VolumeDTO:
-    return VolumeDTO.from_model(volume, request, cluster.get_id())
+    stat_obj = None
+    ret = db.get_lvol_stats(volume, 1)
+    if ret:
+        stat_obj = ret[0]
+    return VolumeDTO.from_model(volume, request, cluster.get_id(), stat_obj)
 
 
 class UpdatableLVolParams(BaseModel):
diff --git a/simplyblock_web/auth_middleware.py b/simplyblock_web/auth_middleware.py
index 8a1a9e83a..87449cb64 100644
--- a/simplyblock_web/auth_middleware.py
+++ b/simplyblock_web/auth_middleware.py
@@ -34,6 +34,10 @@ def decorated(*args: Any, **kwargs: Any) -> ResponseType:
         # Skip authentication for Swagger UI
         if request.method == "GET" and request.path.startswith("/swagger"):
             return cast(ResponseType, f(*args, **kwargs))
+        if request.method == "POST" and request.path.startswith("/cluster/create_first"):
+            return cast(ResponseType, f(*args, **kwargs))
+        if request.method == "GET" and request.path.startswith("/health/fdb"):
+            return cast(ResponseType, f(*args, **kwargs))            
 
         cluster_id: str = ""
         cluster_secret: str = ""
diff --git a/simplyblock_web/node_utils_k8s.py b/simplyblock_web/node_utils_k8s.py
index 4626a89c9..b1440744d 100644
--- a/simplyblock_web/node_utils_k8s.py
+++ b/simplyblock_web/node_utils_k8s.py
@@ -5,6 +5,7 @@
 import time
 
 from simplyblock_core.utils import get_k8s_batch_client
+from kubernetes.client import ApiException
 
 
 node_name = os.environ.get("HOSTNAME")
@@ -23,7 +24,7 @@ def get_namespace():
             return out
     return default_namespace
 
-def wait_for_job_completion(job_name, namespace, timeout=60):
+def wait_for_job_completion(job_name, namespace, timeout=180):
     batch_v1 = get_k8s_batch_client()
     for _ in range(timeout):
         job = batch_v1.read_namespaced_job(job_name, namespace)
@@ -33,3 +34,19 @@ def wait_for_job_completion(job_name, namespace, timeout=60):
             raise RuntimeError(f"Job '{job_name}' failed")
         time.sleep(3)
     raise TimeoutError(f"Timeout waiting for Job '{job_name}' to complete")
+
+def wait_for_job_deletion(job_name, namespace, timeout=60):
+    batch_v1 = get_k8s_batch_client()
+
+    for _ in range(timeout):
+        try:
+            batch_v1.read_namespaced_job(job_name, namespace)
+        except ApiException as e:
+            if e.status == 404:
+                return True
+            else:
+                raise
+
+        time.sleep(2)
+
+    raise TimeoutError(f"Timeout waiting for Job '{job_name}' to be deleted")
diff --git a/simplyblock_web/static/openapi.json b/simplyblock_web/static/openapi.json
new file mode 100644
index 000000000..3e2a05130
--- /dev/null
+++ b/simplyblock_web/static/openapi.json
@@ -0,0 +1 @@
+{"openapi":"3.1.0","info":{"title":"FastAPI","version":"0.1.0"},"paths":{"/api/v2/clusters/":{"get":{"summary":"Clusters:List","operationId":"clusters_list_api_v2_clusters__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/ClusterDTO"},"title":"Response Clusters List Api V2 Clusters  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Create","operationId":"clusters_create_api_v2_clusters__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClusterParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/":{"get":{"summary":"Clusters:Detail","operationId":"clusters_detail_api_v2_clusters__cluster_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ClusterDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"put":{"summary":"Clusters:Update","operationId":"clusters_update_api_v2_clusters__cluster_id___put","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/UpdatableClusterParameters"}}}},"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Delete","operationId":"clusters_delete_api_v2_clusters__cluster_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/capacity":{"get":{"summary":"Clusters:Capacity","operationId":"clusters_capacity_api_v2_clusters__cluster_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/iostats":{"get":{"summary":"Clusters:Iostats","operationId":"clusters_iostats_api_v2_clusters__cluster_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/logs":{"get":{"summary":"Clusters:Logs","operationId":"clusters_logs_api_v2_clusters__cluster_id__logs_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","default":50,"title":"Limit"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/start":{"post":{"summary":"Clusters:Start","operationId":"clusters_start_api_v2_clusters__cluster_id__start_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/shutdown":{"post":{"summary":"Clusters:Shutdown","operationId":"clusters_shutdown_api_v2_clusters__cluster_id__shutdown_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/activate":{"post":{"summary":"Clusters:Activate","operationId":"clusters_activate_api_v2_clusters__cluster_id__activate_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/update":{"post":{"summary":"Clusters:Upgrade","operationId":"clusters_upgrade_api_v2_clusters__cluster_id__update_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_UpdateParams"}}}},"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/":{"get":{"summary":"Clusters:Storage-Nodes:List","operationId":"clusters_storage_nodes_list_api_v2_clusters__cluster_id__storage_nodes__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/StorageNodeDTO"},"title":"Response Clusters Storage Nodes List Api V2 Clusters  Cluster Id  Storage Nodes  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Nodes:Create","operationId":"clusters_storage_nodes_create_api_v2_clusters__cluster_id__storage_nodes__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/StorageNodeParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/":{"get":{"summary":"Clusters:Storage-Nodes:Detail","operationId":"clusters_storage_nodes_detail_api_v2_clusters__cluster_id__storage_nodes__storage_node_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Nodes:Delete","operationId":"clusters_storage_nodes_delete_api_v2_clusters__cluster_id__storage_nodes__storage_node_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"force_remove","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force Remove"}},{"name":"force_migrate","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force Migrate"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/capacity":{"get":{"summary":"Clusters:Storage-Nodes:Capacity","operationId":"clusters_storage_nodes_capacity_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/iostats":{"get":{"summary":"Clusters:Storage-Nodes:Iostats","operationId":"clusters_storage_nodes_iostats_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/nics":{"get":{"summary":"Clusters:Storage-Nodes:Nics:List","operationId":"clusters_storage_nodes_nics_list_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__nics_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/nics/{nic_id}/iostats":{"get":{"summary":"Clusters:Storage-Nodes:Nics:Iostats","operationId":"clusters_storage_nodes_nics_iostats_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__nics__nic_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"nic_id","in":"path","required":true,"schema":{"type":"string","title":"Nic Id"}},{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/suspend":{"post":{"summary":"Clusters:Storage-Nodes:Suspend","operationId":"clusters_storage_nodes_suspend_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__suspend_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"force","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/resume":{"post":{"summary":"Clusters:Storage-Nodes:Resume","operationId":"clusters_storage_nodes_resume_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__resume_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/shutdown":{"post":{"summary":"Clusters:Storage-Nodes:Shutdown","operationId":"clusters_storage_nodes_shutdown_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__shutdown_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"force","in":"query","required":false,"schema":{"type":"boolean","default":false,"title":"Force"}}],"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/restart":{"post":{"summary":"Clusters:Storage-Nodes:Restart","operationId":"clusters_storage_nodes_restart_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__restart_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_RestartParams","default":{"force":false,"reattach_volume":false}}}}},"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/start":{"post":{"summary":"Clusters:Storage-Nodes:Start","operationId":"clusters_storage_nodes_start_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__start_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_RestartParams","default":{"force":false,"reattach_volume":false}}}}},"responses":{"202":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/":{"get":{"summary":"Clusters:Storage Nodes:Devices:List","operationId":"clusters_storage_nodes_devices_list_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/DeviceDTO"},"title":"Response Clusters Storage Nodes Devices List Api V2 Clusters  Cluster Id  Storage Nodes  Storage Node Id  Devices  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/":{"get":{"summary":"Clusters:Storage Nodes:Devices:Detail","operationId":"clusters_storage_nodes_devices_detail_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/DeviceDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage Nodes:Devices:Delete","operationId":"clusters_storage_nodes_devices_delete_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/capacity":{"get":{"summary":"Clusters:Storage Nodes:Devices:Capacity","operationId":"clusters_storage_nodes_devices_capacity_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/iostats":{"get":{"summary":"Clusters:Storage Nodes:Devices:Iostats","operationId":"clusters_storage_nodes_devices_iostats_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-nodes/{storage_node_id}/devices/{device_id}/reset":{"post":{"summary":"Clusters:Storage Nodes:Devices:Reset","operationId":"clusters_storage_nodes_devices_reset_api_v2_clusters__cluster_id__storage_nodes__storage_node_id__devices__device_id__reset_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"storage_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Storage Node Id"}},{"name":"device_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Device Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/":{"get":{"summary":"Clusters:Storage-Pools:List","operationId":"clusters_storage_pools_list_api_v2_clusters__cluster_id__storage_pools__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/StoragePoolDTO"},"title":"Response Clusters Storage Pools List Api V2 Clusters  Cluster Id  Storage Pools  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Pools:Create","operationId":"clusters_storage_pools_create_api_v2_clusters__cluster_id__storage_pools__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/StoragePoolParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/":{"get":{"summary":"Clusters:Storage-Pools:Detail","operationId":"clusters_storage_pools_detail_api_v2_clusters__cluster_id__storage_pools__pool_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/StoragePoolDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Pools:Delete","operationId":"clusters_storage_pools_delete_api_v2_clusters__cluster_id__storage_pools__pool_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"put":{"summary":"Clusters:Storage-Pools:Update","operationId":"clusters_storage_pools_update_api_v2_clusters__cluster_id__storage_pools__pool_id___put","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/UpdatableStoragePoolParams"}}}},"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/iostats":{"get":{"summary":"Clusters:Storage-Pools:Iostats","operationId":"clusters_storage_pools_iostats_api_v2_clusters__cluster_id__storage_pools__pool_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"limit","in":"query","required":false,"schema":{"type":"integer","default":20,"title":"Limit"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/":{"get":{"summary":"Clusters:Storage-Pools:Volumes:List","operationId":"clusters_storage_pools_volumes_list_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/VolumeDTO"},"title":"Response Clusters Storage Pools Volumes List Api V2 Clusters  Cluster Id  Storage Pools  Pool Id  Volumes  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Pools:Volumes:Create","operationId":"clusters_storage_pools_volumes_create_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/RootModel_Union__CreateParams___CloneParams__"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Detail","operationId":"clusters_storage_pools_volumes_detail_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/VolumeDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"put":{"summary":"Clusters:Storage-Pools:Volumes:Update","operationId":"clusters_storage_pools_volumes_update_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id___put","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/UpdatableLVolParams"}}}},"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Pools:Volumes:Delete","operationId":"clusters_storage_pools_volumes_delete_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/inflate":{"post":{"summary":"Clusters:Storage-Pools:Volumes:Inflate","operationId":"clusters_storage_pools_volumes_inflate_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__inflate_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/connect":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Connect","operationId":"clusters_storage_pools_volumes_connect_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__connect_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/capacity":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Capacity","operationId":"clusters_storage_pools_volumes_capacity_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__capacity_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/iostats":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Iostats","operationId":"clusters_storage_pools_volumes_iostats_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__iostats_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}},{"name":"history","in":"query","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"History"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/volumes/{volume_id}/snapshots":{"get":{"summary":"Clusters:Storage-Pools:Volumes:Snapshots:List","operationId":"clusters_storage_pools_volumes_snapshots_list_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__snapshots_get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/SnapshotDTO"},"title":"Response Clusters Storage Pools Volumes Snapshots List Api V2 Clusters  Cluster Id  Storage Pools  Pool Id  Volumes  Volume Id  Snapshots Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Clusters:Storage-Pools:Volumes:Snapshots:Create","operationId":"clusters_storage_pools_volumes_snapshots_create_api_v2_clusters__cluster_id__storage_pools__pool_id__volumes__volume_id__snapshots_post","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"volume_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Volume Id"}}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/_SnapshotParams"}}}},"responses":{"201":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/snapshots/":{"get":{"summary":"Clusters:Storage-Pools:Snapshots:List","operationId":"clusters_storage_pools_snapshots_list_api_v2_clusters__cluster_id__storage_pools__pool_id__snapshots__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/SnapshotDTO"},"title":"Response Clusters Storage Pools Snapshots List Api V2 Clusters  Cluster Id  Storage Pools  Pool Id  Snapshots  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/clusters/{cluster_id}/storage-pools/{pool_id}/snapshots/{snapshot_id}/":{"get":{"summary":"Clusters:Storage-Pools:Snapshots:Detail","operationId":"clusters_storage_pools_snapshots_detail_api_v2_clusters__cluster_id__storage_pools__pool_id__snapshots__snapshot_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"snapshot_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Snapshot Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/SnapshotDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Clusters:Storage-Pools:Snapshots:Delete","operationId":"clusters_storage_pools_snapshots_delete_api_v2_clusters__cluster_id__storage_pools__pool_id__snapshots__snapshot_id___delete","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}},{"name":"pool_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Pool Id"}},{"name":"snapshot_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Snapshot Id"}}],"responses":{"204":{"description":"Successful Response"},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/management-nodes/":{"get":{"summary":"Management Nodes:List","operationId":"management_nodes_list_api_v2_management_nodes__get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"cluster_id","in":"query","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"type":"array","items":{"$ref":"#/components/schemas/ManagementNodeDTO"},"title":"Response Management Nodes List Api V2 Management Nodes  Get"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/api/v2/management-nodes/{management_node_id}/":{"get":{"summary":"Management Node:Detail","operationId":"management_node_detail_api_v2_management_nodes__management_node_id___get","security":[{"HTTPBearer":[]}],"parameters":[{"name":"management_node_id","in":"path","required":true,"schema":{"type":"string","format":"uuid","title":"Management Node Id"}},{"name":"cluster_id","in":"query","required":true,"schema":{"type":"string","format":"uuid","title":"Cluster Id"}}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ManagementNodeDTO"}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"ClusterDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"nqn":{"type":"string","title":"Nqn"},"status":{"type":"string","enum":["active","read_only","inactive","suspended","degraded","unready","in_activation","in_expansion"],"title":"Status"},"rebalancing":{"type":"boolean","title":"Rebalancing"},"block_size":{"type":"integer","minimum":0.0,"title":"Block Size"},"coding":{"prefixItems":[{"type":"integer","minimum":0.0},{"type":"integer","minimum":0.0}],"type":"array","maxItems":2,"minItems":2,"title":"Coding"},"ha":{"type":"boolean","title":"Ha"},"utliziation_critical":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Utliziation Critical"},"utilization_warning":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Utilization Warning"},"provisioned_cacacity_critical":{"type":"integer","minimum":0.0,"title":"Provisioned Cacacity Critical"},"provisioned_cacacity_warning":{"type":"integer","minimum":0.0,"title":"Provisioned Cacacity Warning"},"node_affinity":{"type":"boolean","title":"Node Affinity"},"anti_affinity":{"type":"boolean","title":"Anti Affinity"},"secret":{"type":"string","title":"Secret"}},"type":"object","required":["id","name","nqn","status","rebalancing","block_size","coding","ha","utliziation_critical","utilization_warning","provisioned_cacacity_critical","provisioned_cacacity_warning","node_affinity","anti_affinity","secret"],"title":"ClusterDTO"},"ClusterParams":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"blk_size":{"type":"integer","enum":[512,4096],"title":"Blk Size","default":512},"page_size_in_blocks":{"type":"integer","exclusiveMinimum":0.0,"title":"Page Size In Blocks","default":2097152},"cap_warn":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Cap Warn","default":0},"cap_crit":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Cap Crit","default":0},"prov_cap_warn":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Prov Cap Warn","default":0},"prov_cap_crit":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Prov Cap Crit","default":0},"distr_ndcs":{"type":"integer","title":"Distr Ndcs","default":1},"distr_npcs":{"type":"integer","title":"Distr Npcs","default":1},"distr_bs":{"type":"integer","title":"Distr Bs","default":4096},"distr_chunk_bs":{"type":"integer","title":"Distr Chunk Bs","default":4096},"ha_type":{"type":"string","enum":["single","ha"],"title":"Ha Type","default":"single"},"qpair_count":{"type":"integer","title":"Qpair Count","default":256},"max_queue_size":{"type":"integer","title":"Max Queue Size","default":128},"inflight_io_threshold":{"type":"integer","title":"Inflight Io Threshold","default":4},"enable_node_affinity":{"type":"boolean","title":"Enable Node Affinity","default":false},"strict_node_anti_affinity":{"type":"boolean","title":"Strict Node Anti Affinity","default":false}},"type":"object","title":"ClusterParams"},"DeviceDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"status":{"type":"string","title":"Status"},"health_check":{"type":"boolean","title":"Health Check"},"size":{"type":"integer","title":"Size"},"io_error":{"type":"boolean","title":"Io Error"},"is_partition":{"type":"boolean","title":"Is Partition"},"nvmf_ips":{"items":{"type":"string","format":"ipv4"},"type":"array","title":"Nvmf Ips"},"nvmf_nqn":{"type":"string","title":"Nvmf Nqn","default":""},"nvmf_port":{"type":"integer","title":"Nvmf Port","default":0}},"type":"object","required":["id","status","health_check","size","io_error","is_partition","nvmf_ips"],"title":"DeviceDTO"},"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ManagementNodeDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"status":{"type":"string","title":"Status"},"hostname":{"type":"string","title":"Hostname"},"ip":{"type":"string","format":"ipv4","title":"Ip"}},"type":"object","required":["id","status","hostname","ip"],"title":"ManagementNodeDTO"},"RootModel_Union__CreateParams___CloneParams__":{"anyOf":[{"$ref":"#/components/schemas/_CreateParams"},{"$ref":"#/components/schemas/_CloneParams"}],"title":"RootModel[Union[_CreateParams, _CloneParams]]"},"SnapshotDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"type":"string","title":"Name"},"status":{"type":"string","title":"Status"},"health_check":{"type":"boolean","title":"Health Check"},"size":{"type":"integer","minimum":0.0,"title":"Size"},"used_size":{"type":"integer","minimum":0.0,"title":"Used Size"},"lvol":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Lvol"}},"type":"object","required":["id","name","status","health_check","size","used_size","lvol"],"title":"SnapshotDTO"},"StorageNodeDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"status":{"type":"string","title":"Status"},"ip":{"type":"string","format":"ipv4","title":"Ip"}},"type":"object","required":["id","status","ip"],"title":"StorageNodeDTO"},"StorageNodeParams":{"properties":{"node_address":{"type":"string","title":"Node Address","default":"^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}$"},"interface_name":{"type":"string","title":"Interface Name"},"max_snapshots":{"type":"integer","title":"Max Snapshots","default":500},"ha_jm":{"type":"boolean","title":"Ha Jm","default":true},"test_device":{"type":"boolean","title":"Test Device","default":false},"spdk_image":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Spdk Image"},"spdk_debug":{"type":"boolean","title":"Spdk Debug","default":false},"full_page_unmap":{"type":"boolean","title":"Full Page Unmap","default":false},"data_nics":{"items":{"type":"string"},"type":"array","title":"Data Nics","default":[]},"namespace":{"type":"string","title":"Namespace","default":"default"},"jm_percent":{"type":"integer","maximum":100.0,"minimum":0.0,"title":"Jm Percent","default":3},"partitions":{"type":"integer","title":"Partitions","default":1},"iobuf_small_pool_count":{"type":"integer","title":"Iobuf Small Pool Count","default":0},"iobuf_large_pool_count":{"type":"integer","title":"Iobuf Large Pool Count","default":0},"ha_jm_count":{"type":"integer","title":"Ha Jm Count","default":3}},"type":"object","required":["interface_name","spdk_image"],"title":"StorageNodeParams"},"StoragePoolDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"type":"string","title":"Name"},"status":{"type":"string","enum":["active","inactive"],"title":"Status"},"max_size":{"type":"integer","minimum":0.0,"title":"Max Size"},"volume_max_size":{"type":"integer","minimum":0.0,"title":"Volume Max Size"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops"},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes"},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes"},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes"}},"type":"object","required":["id","name","status","max_size","volume_max_size","max_rw_iops","max_rw_mbytes","max_r_mbytes","max_w_mbytes"],"title":"StoragePoolDTO"},"StoragePoolParams":{"properties":{"name":{"type":"string","title":"Name"},"pool_max":{"type":"integer","minimum":0.0,"title":"Pool Max","default":0},"volume_max_size":{"type":"integer","minimum":0.0,"title":"Volume Max Size","default":0},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops","default":0},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes","default":0},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes","default":0},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes","default":0}},"type":"object","required":["name"],"title":"StoragePoolParams"},"UpdatableClusterParameters":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"}},"type":"object","title":"UpdatableClusterParameters"},"UpdatableLVolParams":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops","default":0},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes","default":0},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes","default":0},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes","default":0},"size":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Size"}},"type":"object","title":"UpdatableLVolParams"},"UpdatableStoragePoolParams":{"properties":{"name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Name"},"max_size":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max Size"},"volume_max_size":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Volume Max Size"},"max_rw_iops":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max Rw Iops"},"max_rw_mbytes":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max Rw Mbytes"},"max_r_mbytes":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max R Mbytes"},"max_w_mbytes":{"anyOf":[{"type":"integer","minimum":0.0},{"type":"null"}],"title":"Max W Mbytes"}},"type":"object","title":"UpdatableStoragePoolParams"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"VolumeDTO":{"properties":{"id":{"type":"string","format":"uuid","title":"Id"},"name":{"type":"string","title":"Name"},"status":{"type":"string","title":"Status"},"health_check":{"type":"boolean","title":"Health Check"},"nqn":{"type":"string","title":"Nqn"},"nodes":{"items":{"type":"string"},"type":"array","title":"Nodes"},"port":{"type":"integer","exclusiveMaximum":65536.0,"minimum":0.0,"title":"Port"},"size":{"type":"integer","minimum":0.0,"title":"Size"},"cloned_from":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Cloned From"},"crypto_key":{"anyOf":[{"prefixItems":[{"type":"string"},{"type":"string"}],"type":"array","maxItems":2,"minItems":2},{"type":"null"}],"title":"Crypto Key"},"high_availability":{"type":"boolean","title":"High Availability"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops"},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes"},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes"},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes"}},"type":"object","required":["id","name","status","health_check","nqn","nodes","port","size","cloned_from","crypto_key","high_availability","max_rw_iops","max_rw_mbytes","max_r_mbytes","max_w_mbytes"],"title":"VolumeDTO"},"_CloneParams":{"properties":{"name":{"type":"string","title":"Name"},"snapshot_id":{"anyOf":[{"type":"string","pattern":"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"},{"type":"null"}],"title":"Snapshot Id"},"size":{"type":"integer","minimum":0.0,"title":"Size","default":0}},"type":"object","required":["name","snapshot_id"],"title":"_CloneParams"},"_CreateParams":{"properties":{"name":{"type":"string","title":"Name"},"size":{"type":"integer","minimum":0.0,"title":"Size"},"crypto_key":{"anyOf":[{"prefixItems":[{"type":"string"},{"type":"string"}],"type":"array","maxItems":2,"minItems":2},{"type":"null"}],"title":"Crypto Key"},"max_rw_iops":{"type":"integer","minimum":0.0,"title":"Max Rw Iops","default":0},"max_rw_mbytes":{"type":"integer","minimum":0.0,"title":"Max Rw Mbytes","default":0},"max_r_mbytes":{"type":"integer","minimum":0.0,"title":"Max R Mbytes","default":0},"max_w_mbytes":{"type":"integer","minimum":0.0,"title":"Max W Mbytes","default":0},"ha_type":{"anyOf":[{"type":"string","enum":["single","ha"]},{"type":"null"}],"title":"Ha Type"},"host_id":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Host Id"},"priority_class":{"type":"integer","enum":[0,1],"title":"Priority Class","default":0},"namespace":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Namespace"},"pvc_name":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Pvc Name"},"ndcs":{"type":"integer","minimum":0.0,"title":"Ndcs","default":0},"npcs":{"type":"integer","minimum":0.0,"title":"Npcs","default":0}},"type":"object","required":["name","size"],"title":"_CreateParams"},"_RestartParams":{"properties":{"force":{"type":"boolean","title":"Force","default":false},"reattach_volume":{"type":"boolean","title":"Reattach Volume","default":false}},"type":"object","title":"_RestartParams"},"_SnapshotParams":{"properties":{"name":{"type":"string","title":"Name"}},"type":"object","required":["name"],"title":"_SnapshotParams"},"_UpdateParams":{"properties":{"management_image":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Management Image"},"spdk_image":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Spdk Image"},"restart":{"type":"boolean","title":"Restart","default":false}},"type":"object","required":["management_image","spdk_image"],"title":"_UpdateParams"}},"securitySchemes":{"HTTPBearer":{"type":"http","scheme":"bearer"}}}}
\ No newline at end of file
diff --git a/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2 b/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2
index 734d9c59e..74f66721d 100644
--- a/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2
+++ b/simplyblock_web/templates/oc_storage_core_isolation.yaml.j2
@@ -34,9 +34,18 @@ spec:
             - |
               set -e
 
+              MARKER="/var/simplyblock/.cpu_isolation_applied"
+
               echo "--- Installing jq ---"
               apk add --no-cache jq
 
+              echo "--- Checking if node was already configured ---"
+
+              if [[ -f "$MARKER" ]]; then
+                  echo "[INFO] Node already configured. Skipping sleep and exiting..."
+                  exit 0
+              fi
+
               echo "--- Reading isolated cores from config ---"
               CONFIG_FILE="/var/simplyblock/sn_config_file"
 
@@ -105,4 +114,8 @@ spec:
 
               echo "[INFO] Init setup and CPU isolation complete."
               
-              echo "--- Init setup complete ---"
+              echo "[INFO] Marking node as configured."
+              touch "$MARKER"
+
+              echo "[INFO] Node is rebooting. Sleeping indefinitely to stop pipeline..."
+              sleep infinity
diff --git a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2 b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2
index f10478c75..81f1e1eda 100644
--- a/simplyblock_web/templates/storage_deploy_spdk.yaml.j2
+++ b/simplyblock_web/templates/storage_deploy_spdk.yaml.j2
@@ -1,7 +1,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  name: snode-spdk-pod-{{ RPC_PORT }}
+  name: snode-spdk-pod-{{ RPC_PORT }}-{{ CLUSTER_ID }}
   namespace: {{ NAMESPACE }}
   labels:
     app: spdk-app-{{ RPC_PORT }}
@@ -87,16 +87,6 @@ spec:
           value: "{{ TOTAL_HP }}"
         - name: RPC_PORT
           value: "{{ RPC_PORT }}"
-        - name: SPDKCSI_SECRET
-          valueFrom:
-            secretKeyRef:
-              name: simplyblock-csi-secret
-              key: secret.json
-        - name: CLUSTER_CONFIG
-          valueFrom:
-            configMapKeyRef:
-              name: simplyblock-csi-cm
-              key: config.json
       lifecycle:
         postStart:
           exec:
diff --git a/simplyblock_web/utils.py b/simplyblock_web/utils.py
index b0d1795df..a610cd177 100644
--- a/simplyblock_web/utils.py
+++ b/simplyblock_web/utils.py
@@ -149,6 +149,7 @@ def error_handler(exception: Exception):
 
 class RPCPortParams(BaseModel):
     rpc_port: int = Field(constants.RPC_HTTP_PROXY_PORT, ge=0, le=65536)
+    cluster_id: Optional[str]
 
 
 class DeviceParams(BaseModel):

From 6cd0dd63c6084aef4869e5aa5d0d83a5e1f1f2b1 Mon Sep 17 00:00:00 2001
From: Alexander Sheredin <alexander@simplyblock.io>
Date: Wed, 21 Jan 2026 03:00:52 +0300
Subject: [PATCH 68/68] add lvol migration task runner

---
 .../controllers/lvol_migration_controller.py  | 693 ++++++++++--------
 .../controllers/tasks_controller.py           |  39 +
 simplyblock_core/db_controller.py             |   6 +
 simplyblock_core/models/job_schedule.py       |   1 +
 .../services/tasks_runner_lvol_migration.py   | 111 +++
 simplyblock_core/storage_node_ops.py          |  12 -
 6 files changed, 540 insertions(+), 322 deletions(-)
 create mode 100644 simplyblock_core/services/tasks_runner_lvol_migration.py

diff --git a/simplyblock_core/controllers/lvol_migration_controller.py b/simplyblock_core/controllers/lvol_migration_controller.py
index cc0c100d3..ec810434a 100644
--- a/simplyblock_core/controllers/lvol_migration_controller.py
+++ b/simplyblock_core/controllers/lvol_migration_controller.py
@@ -2,17 +2,15 @@
 from logging import exception
 from time import sleep
 
-from jc.parsers.asn1crypto.core import Boolean
-
-from ..cluster_ops import db_controller
-from ..models.lvol_migration import *
+from simplyblock_core.cluster_ops import db_controller
 from dataclasses import dataclass
-from typing import Optional, Dict
+from typing import Optional
 from simplyblock_core.storage_node_ops import *
 from simplyblock_core.db_controller import *
 from simplyblock_core.models.lvol_model import LVol
 from simplyblock_core.models.storage_node import StorageNode
-from simplyblock_core.models.lvol_migration import Snapshot
+from simplyblock_core.models.lvol_migration import Snapshot, \
+    ObjectMigrationState, LogicalVolumeRef, MigrationState
 from simplyblock_core.models.snapshot import SnapShot
 from datetime import datetime
 
@@ -22,8 +20,7 @@
 import copy
 
 
-
-#TODOS: Integrate in Task Mgmt
+# TODOS: Integrate in Task Mgmt
 #       Asynchronous delete of objects must check results before sync delete and cleanup is ready
 #       must reconnect rpc clients after node restart
 #       double-check all object states
@@ -38,6 +35,7 @@ def generate_nqn():
     nqn = f"nqn.2024-01.io.simplyblock:tmp:{random_uuid}"
     return nqn
 
+
 class MigrationQueueObjectType:
     SNAPSHOT = "snapshot"
     CLONE = "clone"
@@ -68,7 +66,6 @@ def reset(self):
         self.objects.clear()
 
 
-
 class MigrationService:
     """Service containing core migration logic."""
 
@@ -76,14 +73,13 @@ class MigrationService:
     RETRY_DELAY = 5  # seconds, can be increased exponentially
 
 
-
 # ---------------------------------------------------------------------------
 # Migration Controller
 # ---------------------------------------------------------------------------
 
 class MigrationController:
     """Controller orchestrates LVOL migrations."""
-
+    migrate_lock = threading.Lock()
 
     def __init__(self):
         self._stop_event = threading.Event()
@@ -92,16 +88,17 @@ def __init__(self):
         self.db_controller = DBController()
         self.prev_time = datetime.now()
 
-    #connect clients for both primary (source) and secondary (target) nodes
+
+    # connect clients for both primary (source) and secondary (target) nodes
     def connect_clients(self):
-      try:
-        self.m.rpc_client1 = self.connect_client(self.m.node_pri)
-        self.m.rpc_client2 = self.connect_client(self.m.node_sec)
-        self.m.rpc_client3 = self.connect_client(self.m.target_node_pri)
-        self.m.rpc_client4 = self.connect_client(self.m.target_node_sec)
-      except:
-        raise f"migration {self.m.uuid}: cannot create rpc client for all nodes. all nodes online?"
-      return
+        try:
+            self.m.rpc_client1 = self.connect_client(self.m.node_pri)
+            self.m.rpc_client2 = self.connect_client(self.m.node_sec)
+            self.m.rpc_client3 = self.connect_client(self.m.target_node_pri)
+            self.m.rpc_client4 = self.connect_client(self.m.target_node_sec)
+        except:
+            raise f"migration {self.m.uuid}: cannot create rpc client for all nodes. all nodes online?"
+        return
 
     def get_rpc_client(self, node: StorageNode):
         if node.uuid == self.m.node_pri.uuid:
@@ -113,15 +110,17 @@ def get_rpc_client(self, node: StorageNode):
         elif node.uuid == self.m.target_node_sec.uuid:
             client = self.m.rpc_client4
         else:
-            raise RuntimeError(f"migration {self.m.uuid}: invalid node {node.uuid}, stopping. ")
+            raise RuntimeError(
+                f"migration {self.m.uuid}: invalid node {node.uuid}, stopping. ")
         if not client or node.status != StorageNode.STATUS_ONLINE:
-            raise RuntimeError(f"migration {self.m.uuid}: node {node.uuid} not online, stopping. ")
+            raise RuntimeError(
+                f"migration {self.m.uuid}: node {node.uuid} not online, stopping. ")
         return client
 
     def snap_assign(self, lvol: LogicalVolumeRef, snap: SnapShot):
         s = Snapshot()
-        s.lvol=lvol
-        s.snap=snap
+        s.lvol = lvol
+        s.snap = snap
         return s
 
     def lvol_assign(self, lvol: LVol):
@@ -131,11 +130,11 @@ def lvol_assign(self, lvol: LVol):
 
     def check_nodes_online(self):
         if self.m.node_pri.status == StorageNode.STATUS_ONLINE and self.m.node_sec.status == StorageNode.STATUS_ONLINE and self.m.target_node_pri.status == StorageNode.STATUS_ONLINE and self.m.target_node_sec.status == StorageNode.STATUS_ONLINE:
-               return True
+            return True
         return False
 
     def raise_exception_on_error(self, ret: dict, err_str: str):
-        error="object not found"
+        error = "object not found"
         if not ret or "error" in ret:
             if ret:
                 error = f"{ret['error']['message']}:{ret['error']['code']}"
@@ -146,33 +145,38 @@ def raise_exception_on_error(self, ret: dict, err_str: str):
     def get_transfer_state(self, node: StorageNode, counter: int):
         client = self.get_rpc_client(node)
         for m in self.m.completion_poll_queue:
-            if m.status==ObjectMigrationState.TRANSFER:
-              try:
-                 name=m.snap.lvol.lvs_name+"/"+m.snap.snap_bdev
-                 ret = client.bdev_lvol_transfer_stat(name)
-                 self.raise_exception_on_error(ret, f"could not get transfer state for lvol: {name}")
-                 if ret["transfer_state"]=="Done":
-                        m.status=ObjectMigrationState.TRANSFERRED
+            if m.status == ObjectMigrationState.TRANSFER:
+                try:
+                    name = m.snap.lvol.lvs_name + "/" + m.snap.snap_bdev
+                    ret = client.bdev_lvol_transfer_stat(name)
+                    self.raise_exception_on_error(ret,
+                                                  f"could not get transfer state for lvol: {name}")
+                    if ret["transfer_state"] == "Done":
+                        m.status = ObjectMigrationState.TRANSFERRED
                         self.m.write_to_db(db_controller.kv_store)
                         self.m.completion_poll_queue.remove(m)
                         return True, 0
-                 else:
-                     return False, ret["offset"]
-              except:
-                  logger.error(f"could not get transfer state for lvol")
-                  return False, 0
+                    else:
+                        return False, ret["offset"]
+                except:
+                    logger.error(f"could not get transfer state for lvol")
+                    return False, 0
         return False, 0
 
     def create_snapshot(self, node: StorageNode, index: int):
         client = self.get_rpc_client(node)
-        ret=client.lvol_exists(node.lvstore,"mig_snap_"+str(index)+"_"+self.m.vol.lvol.lvol_name)
+        ret = client.lvol_exists(node.lvstore, "mig_snap_" + str(
+            index) + "_" + self.m.vol.lvol.lvol_name)
         if not ret or "error" in ret:
-            ret=client.lvol_create_snapshot(self.m.vol.lvol.lvol_uuid, "mig_snap_"+str(index)+"_"+self.m.vol.lvol.lvol_name)
-            self.raise_exception_on_error(ret, f"could not create snapshot for lvol: {self.m.vol.lvol.uuid}")
+            ret = client.lvol_create_snapshot(self.m.vol.lvol.lvol_uuid,
+                                              "mig_snap_" + str(
+                                                  index) + "_" + self.m.vol.lvol.lvol_name)
+            self.raise_exception_on_error(ret,
+                                          f"could not create snapshot for lvol: {self.m.vol.lvol.uuid}")
         for sn in self.m.snapshots:
-            if sn.snap.uuid==ret["result"]:
+            if sn.snap.uuid == ret["result"]:
                 return True
-        s=self.snap_assign(self.m.vol,ret["result"])
+        s = self.snap_assign(self.m.vol, ret["result"])
         self.m.snapshots.append(s)
         return True
 
@@ -193,8 +197,9 @@ def migrations_list(self):
         return utils.print_table(data)
 
     @staticmethod
-    def connect_client(node:StorageNode):
-        return RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username, node.rpc_password, timeout=3, retry=1)
+    def connect_client(node: StorageNode):
+        return RPCClient(node.mgmt_ip, node.rpc_port, node.rpc_username,
+                         node.rpc_password, timeout=3, retry=1)
 
     def unfreeze_objects(self):
         self.db_controller = DBController()
@@ -212,172 +217,212 @@ def unfreeze_objects(self):
         return True
 
     def complete_snapshot_migration(self):
-        tr=db_controller.kv_store.create_transaction()
-        #snapshot objects are always create new, while lvols are really migrated
+        tr = db_controller.kv_store.create_transaction()
+        # snapshot objects are always create new, while lvols are really migrated
         for s in self.m.snapshots:
-            if s.status==ObjectMigrationState.DONE:
+            if s.status == ObjectMigrationState.DONE:
                 snapshot = copy.copy(s.snap)
                 snapshot.uuid = str(uuid.uuid4())
                 snapshot.snap_uuid = s.target_uuid
-                snapshot.node_id=self.m.node_pri.get_id()
-                snapshot.write_to_db(db_controller.kv_store,tr)
+                snapshot.node_id = self.m.node_pri.get_id()
+                snapshot.write_to_db(db_controller.kv_store, tr)
 
         lvol = copy.copy(self.m.vol.lvol)
-        lvol.node_id=self.m.node_pri.get_id()
-        lvol.lvol_bdev=self.m.vol.lvol.lvol_bdev
-        lvol.blobid=self.m.vol.lvol.blobid
-        lvol.lvol_uuid=self.m.vol.lvol.lvol_uuid
-        lvol.lvs_name=self.m.vol.lvol.lvs_name
-        lvol.write_to_db(db_controller.kv_store,tr)
+        lvol.node_id = self.m.node_pri.get_id()
+        lvol.lvol_bdev = self.m.vol.lvol.lvol_bdev
+        lvol.blobid = self.m.vol.lvol.blobid
+        lvol.lvol_uuid = self.m.vol.lvol.lvol_uuid
+        lvol.lvs_name = self.m.vol.lvol.lvs_name
+        lvol.write_to_db(db_controller.kv_store, tr)
         try:
-          tr.commit.wait()
+            tr.commit.wait()
         except:
-          raise RuntimeError(f"migration {self.m.uuid}: error updating snapshots and volumes in db.")
+            raise RuntimeError(
+                f"migration {self.m.uuid}: error updating snapshots and volumes in db.")
         return True
 
     def create_lvol(self, node: StorageNode, snap: Snapshot):
-            client = self.get_rpc_client(node)
-            name = node.lvstore + "/" + snap.snap.snap_bdev
-            snap_uuid = client.lvol_exists(node.lvstore,node.lvstore+"/"+snap.snap.snap_bdev)
-            if not snap_uuid or "error" in snap_uuid:
-                     snap_uuid = client.create_lvol(name, snap.snap.size, self.m.target_node_pri.lvstore,
-                                                         self.m.vol.lvol.lvol_priority_class,
-                                                         self.m.vol.lvol.ndcs,
-                                                         self.m.vol.lvol.npcs)
-                     self.raise_exception_on_error(snap_uuid,f"could not create lvol on target: {snap.snap.uuid}")
-            snap.target_uuid = snap_uuid["result"]
-            return True
+        client = self.get_rpc_client(node)
+        name = node.lvstore + "/" + snap.snap.snap_bdev
+        snap_uuid = client.lvol_exists(node.lvstore,
+                                       node.lvstore + "/" + snap.snap.snap_bdev)
+        if not snap_uuid or "error" in snap_uuid:
+            snap_uuid = client.create_lvol(name, snap.snap.size,
+                                           self.m.target_node_pri.lvstore,
+                                           self.m.vol.lvol.lvol_priority_class,
+                                           self.m.vol.lvol.ndcs,
+                                           self.m.vol.lvol.npcs)
+            self.raise_exception_on_error(snap_uuid,
+                                          f"could not create lvol on target: {snap.snap.uuid}")
+        snap.target_uuid = snap_uuid["result"]
+        return True
 
     def set_mig_status(self, node: StorageNode, snap: Snapshot):
-            client = self.get_rpc_client(node)
-            name = self.m.target_node_pri.lvstore + "/" + snap.snap.snap_bdev
-            ret=client.bdev_lvol_set_migration_flag(name)
-            self.raise_exception_on_error(ret, f"issue creating an target object during migration of snapshot {snap.uuid}")
-            snap.status = ObjectMigrationState.MIG_FLAG_SET
-            self.m.write_to_db(self.db_controller.kv_store)
-            return True
+        client = self.get_rpc_client(node)
+        name = self.m.target_node_pri.lvstore + "/" + snap.snap.snap_bdev
+        ret = client.bdev_lvol_set_migration_flag(name)
+        self.raise_exception_on_error(ret,
+                                      f"issue creating an target object during migration of snapshot {snap.uuid}")
+        snap.status = ObjectMigrationState.MIG_FLAG_SET
+        self.m.write_to_db(self.db_controller.kv_store)
+        return True
 
-    def export_lvol(self, node: StorageNode, nqn: str, s: Snapshot, anaState: str, namespaces: int, serial: str, model: str):
+    def export_lvol(self, node: StorageNode, nqn: str, s: Snapshot,
+                    anaState: str, namespaces: int, serial: str, model: str):
         client = self.get_rpc_client(node)
-        #check if subsystem exists, namespace is added and listener exists
-        #nqn=generate_nqn()
-        ss,listener,ns=client.find_subsystem_by_nqn(nqn)
+        # check if subsystem exists, namespace is added and listener exists
+        # nqn=generate_nqn()
+        ss, listener, ns = client.find_subsystem_by_nqn(nqn)
         if not ss:
-             ret=client.subsystem_create(nqn,serial, model, 1, namespaces)
-             self.raise_exception_on_error(ret, f"could not list subsystem for lvol: {s.snap.uuid}")
+            ret = client.subsystem_create(nqn, serial, model, 1, namespaces)
+            self.raise_exception_on_error(ret,
+                                          f"could not list subsystem for lvol: {s.snap.uuid}")
         if not ns:
-             ret=client.nvmf_subsystem_add_ns(s.temporary_nqn,s.snap.lvol.lvs_name+"/"+s.snap.snap_bdev)
-             self.raise_exception_on_error(ret,f"could not list subsystem for lvol: {s.snap.uuid} ")
+            ret = client.nvmf_subsystem_add_ns(s.temporary_nqn,
+                                               s.snap.lvol.lvs_name + "/" + s.snap.snap_bdev)
+            self.raise_exception_on_error(ret,
+                                          f"could not list subsystem for lvol: {s.snap.uuid} ")
         if not listener:
             if self.m.target_node_pri.active_rdma:
-               fabric="RDMA"
+                fabric = "RDMA"
             else:
-               fabric="TCP"
-            ret=client.nvmf_subsystem_add_listener(s.temporary_nqn, fabric,self.m.target_node_pri.nvmf_port,
-                    self.m.target_node_pri.hostname, anaState)
-            self.raise_exception_on_error(ret, f"could not list subsystem for lvol: {s.snap.uuid}")
+                fabric = "TCP"
+            ret = client.nvmf_subsystem_add_listener(s.temporary_nqn, fabric,
+                                                     self.m.target_node_pri.nvmf_port,
+                                                     self.m.target_node_pri.hostname,
+                                                     anaState)
+            self.raise_exception_on_error(ret,
+                                          f"could not list subsystem for lvol: {s.snap.uuid}")
         return True
 
-    #delete subystem only, if there is only zero or one namespaces left;
-    #if one namespace is left, it must match the volume
-    def delete_subsystem(self, node: StorageNode, nqn:str, lvol: LVol):
-        client=self.get_rpc_client(node)
-        data=client.subsystem_list(nqn)
+    # delete subystem only, if there is only zero or one namespaces left;
+    # if one namespace is left, it must match the volume
+    def delete_subsystem(self, node: StorageNode, nqn: str, lvol: LVol):
+        client = self.get_rpc_client(node)
+        data = client.subsystem_list(nqn)
         if not data:
             return False
         ret = None
         for subsystem in data['result']:
             # Check if the subsystem has namespaces
             namespaces = subsystem.get('namespaces', None)
-            if not namespaces or len(namespaces<2):
-                   ret=client.subsystem_delete(nqn)
-                   self.raise_exception_on_error(data, f"could not delete subsystem: {nqn} for lvol: {lvol.uuid}")
-            elif len(namespaces>1):
-                client.nvmf_subsystem_remove_ns(nqn,lvol.namespace)
+            if not namespaces or len(namespaces < 2):
+                ret = client.subsystem_delete(nqn)
+                self.raise_exception_on_error(data,
+                                              f"could not delete subsystem: {nqn} for lvol: {lvol.uuid}")
+            elif len(namespaces > 1):
+                client.nvmf_subsystem_remove_ns(nqn, lvol.namespace)
         return True
 
     def connect_lvol(self, node: StorageNode, s: Snapshot):
         client = self.get_rpc_client(node)
         if node.active_rdma:
-            transport="RDMA"
+            transport = "RDMA"
         else:
-            transport="TCP"
-        ret=client.nvmf_get_subsystems()
+            transport = "TCP"
+        ret = client.nvmf_get_subsystems()
         subsystem = None
         if ret and not "error" in ret:
-            subsystem = next((s for s in ret["result"] if s["nqn"] == s.temporary_nqn), None)
-        attach=True
+            subsystem = next(
+                (s for s in ret["result"] if s["nqn"] == s.temporary_nqn), None)
+        attach = True
         if subsystem:
-            attach=False
-            first_namespace_name = subsystem.get("namespaces", [{}])[0].get("name")
+            attach = False
+            first_namespace_name = subsystem.get("namespaces", [{}])[0].get(
+                "name")
             if first_namespace_name == None:
                 client.bdev_nvme_detach_controller(s.snap.snap_bdev)
-                self.raise_exception_on_error(ret, f"could not remove remote controller: {s.snap.uuid}")
-            attach=True
+                self.raise_exception_on_error(ret,
+                                              f"could not remove remote controller: {s.snap.uuid}")
+            attach = True
         if attach:
-           ret = client.bdev_nvme_attach_controller(s.snap.snap_bdev,s.temporary_nqn,node.hostname,node.nvmf_port,transport)
-           self.raise_exception_on_error(ret, f"could not connect lvol: {s.snap.uuid}")
-           s.controller = ret[0]
+            ret = client.bdev_nvme_attach_controller(s.snap.snap_bdev,
+                                                     s.temporary_nqn,
+                                                     node.hostname,
+                                                     node.nvmf_port, transport)
+            self.raise_exception_on_error(ret,
+                                          f"could not connect lvol: {s.snap.uuid}")
+            s.controller = ret[0]
         return True
 
-    def delete_lvol_from_node(self, node: StorageNode, oid: str, deleteType: bool):
-        client=self.get_rpc_client(node)
-        lvol=db_controller.get_lvol_by_id(oid)
+    def delete_lvol_from_node(self, node: StorageNode, oid: str,
+                              deleteType: bool):
+        client = self.get_rpc_client(node)
+        lvol = db_controller.get_lvol_by_id(oid)
         if lvol:
-           ret=client.delete_lvol(lvol.lvs_name+"/"+lvol.lvol_name, deleteType)
+            ret = client.delete_lvol(lvol.lvs_name + "/" + lvol.lvol_name,
+                                     deleteType)
         else:
-           snap=db_controller.get_snapshot_by_id(oid)
-           ret=client.delete_lvol(snap.lvol.lvs_name + "/" + snap.lvol.lvol_name, deleteType)
-        self.raise_exception_on_error(ret, f"could not delete snapshot/lvol: {oid} ")
+            snap = db_controller.get_snapshot_by_id(oid)
+            ret = client.delete_lvol(
+                snap.lvol.lvs_name + "/" + snap.lvol.lvol_name, deleteType)
+        self.raise_exception_on_error(ret,
+                                      f"could not delete snapshot/lvol: {oid} ")
         return
 
     def transfer_data(self, node: StorageNode, snap: Snapshot, offset: int):
         try:
-          client = self.get_rpc_client(node)
-          ret=client.bdev_lvol_transfer(snap.snap.lvol.lvs_name+"/"+snap.snap.snap_bdev,offset,4,snap.controller, "migrate")
-          self.raise_exception_on_error(ret, f"could not transfer data: {snap.snap.uuid} ")
+            client = self.get_rpc_client(node)
+            ret = client.bdev_lvol_transfer(
+                snap.snap.lvol.lvs_name + "/" + snap.snap.snap_bdev, offset, 4,
+                snap.controller, "migrate")
+            self.raise_exception_on_error(ret,
+                                          f"could not transfer data: {snap.snap.uuid} ")
         except Exception as e:
             logger.error(e)
             return False
         return True
 
     def convert_lvol(self, s: Snapshot):
-        client=self.get_rpc_client(self.m.target_node_pri)
-        ret=client.bdev_lvol_convert(s.snap.lvol.lvs_name+"/"+s.snap.snap_bdev)
+        client = self.get_rpc_client(self.m.target_node_pri)
+        ret = client.bdev_lvol_convert(
+            s.snap.lvol.lvs_name + "/" + s.snap.snap_bdev)
         if ret and "exists" in ret:
             return True
-        self.raise_exception_on_error(ret, f"could not convert lvol to snapshot: {s.snap.uuid} to remote subsystem:")
+        self.raise_exception_on_error(ret,
+                                      f"could not convert lvol to snapshot: {s.snap.uuid} to remote subsystem:")
         return True
 
     def time_difference(self):
-        return (datetime.now()-self.prev_time).total_seconds()
+        return (datetime.now() - self.prev_time).total_seconds()
 
     def create_target_lvol(self, s: Snapshot):
         client = self.get_rpc_client(self.m.target_node_pri)
-        ret=client.create_lvol(s.snap.snap_bdev,s.snap.size,self.m.target_node_pri.lvstore,self.m.vol.lvol.lvol_priority_class,self.m.vol.lvol.ndcs,self.m.vol.lvol.npcs)
-        self.raise_exception_on_error(ret, f"could not create target lvol for snapshot:{s.snap.uuid}")
+        ret = client.create_lvol(s.snap.snap_bdev, s.snap.size,
+                                 self.m.target_node_pri.lvstore,
+                                 self.m.vol.lvol.lvol_priority_class,
+                                 self.m.vol.lvol.ndcs, self.m.vol.lvol.npcs)
+        self.raise_exception_on_error(ret,
+                                      f"could not create target lvol for snapshot:{s.snap.uuid}")
         return True
 
     def create_target_lvol2(self, node: StorageNode, l: LogicalVolumeRef):
         client = self.get_rpc_client(node)
         if l.lvol.crypto_bdev != "":
-               client.lvol_crypto_create(l.lvol.crypto_bdev,l.lvol.lvol_bdev,l.lvol.crypto_key_name)
-        ret = client.create_lvol(l.lvol.lvol_bdev, l.lvol.size, node.lvstore, l.lvol.lvol_priority_class, l.lvol.ndcs, l.lvol.npcs)
-        ret=client.create_lvol(l.lvol.lvol_bdev,l.lvol.size,node.lvstore,l.lvol.lvol_priority_class,l.lvol.ndcs,l.lvol.npcs)
-        self.raise_exception_on_error(ret, f"could not create target lvol for main lvol:{l.lvol.uuid}")
+            client.lvol_crypto_create(l.lvol.crypto_bdev, l.lvol.lvol_bdev,
+                                      l.lvol.crypto_key_name)
+        ret = client.create_lvol(l.lvol.lvol_bdev, l.lvol.size, node.lvstore,
+                                 l.lvol.lvol_priority_class, l.lvol.ndcs,
+                                 l.lvol.npcs)
+        self.raise_exception_on_error(ret,
+                                      f"could not create target lvol for main lvol:{l.lvol.uuid}")
         return True
 
     def connect_hublvol(self, node: StorageNode):
         client = self.get_rpc_client(node)
         if node.active_rdma:
-            fabric="RDMA"
+            fabric = "RDMA"
         else:
-            fabric="TCP"
+            fabric = "TCP"
 
-        ret=client.bdev_nvme_controller_list("migratelvol")
+        ret = client.bdev_nvme_controller_list("migratelvol")
         if not ret:
-           ret=client.bdev_nvme_attach_controller("migratelvol",node.hublvol,node.hostname,node.nvmf_port,fabric)
-           self.raise_exception_on_error(ret, f"could not attach controller for {self.m.vol.lvol.uuid} for hublvol")
+            ret = client.bdev_nvme_attach_controller("migratelvol",
+                                                     node.hublvol,
+                                                     node.hostname,
+                                                     node.nvmf_port, fabric)
+            self.raise_exception_on_error(ret,
+                                          f"could not attach controller for {self.m.vol.lvol.uuid} for hublvol")
 
         return True
 
@@ -385,24 +430,32 @@ def transfer_data_final(self):
         client1 = self.get_rpc_client(self.m.node_pri)
         client2 = self.get_rpc_client(self.m.target_node_sec)
         client3 = self.get_rpc_client(self.m.target_node_pri)
-        uuid, map_id = client3.lvol_exists(self.m.target_node_pri,self.m.vol)
+        uuid, map_id = client3.lvol_exists(self.m.target_node_pri, self.m.vol)
         if not uuid:
-             self.create_target_lvol2(self.m.target_node_pri,self.m.vol)
-             uuid1, _ = client2.lvol_exists(self.m.target_node_sec, self.m.vol)
-             if not uuid1:
-                ret=client2.bdev_lvol_register(self.m.vol.lvol.lvol_bdev,self.m.target_node_sec.lvstore, self.m.vol.lvol.blobid, self.m.vol.lvol.lvol_uuid)
-                self.raise_exception_on_error(ret, f"could not register on secondary {self.m.vol.lvol.uuid}")
+            self.create_target_lvol2(self.m.target_node_pri, self.m.vol)
+            uuid1, _ = client2.lvol_exists(self.m.target_node_sec, self.m.vol)
+            if not uuid1:
+                ret = client2.bdev_lvol_register(self.m.vol.lvol.lvol_bdev,
+                                                 self.m.target_node_sec.lvstore,
+                                                 self.m.vol.lvol.blobid,
+                                                 self.m.vol.lvol.lvol_uuid)
+                self.raise_exception_on_error(ret,
+                                              f"could not register on secondary {self.m.vol.lvol.uuid}")
 
         self.connect_hublvol(self.m.node_pri)
 
-        uuid, map_id = client3.lvol_exists(self.m.target_node_pri.lvstore,self.m.vol.lvol.lvol_bdev)
+        uuid, map_id = client3.lvol_exists(self.m.target_node_pri.lvstore,
+                                           self.m.vol.lvol.lvol_bdev)
         if not uuid or not map_id:
-            raise  RuntimeError(
+            raise RuntimeError(
                 f"migration {self.m.uuid}: could not get mapid of volume: {self.m.vol.lvol.uuid}")
         last_snap_uuid = (self.m.snapshots)[-1].snap.snap_uuid
-        ret = client1.bdev_lvol_final_migration(self.m.vol.lvol.lvol_bdev,map_id,
-                                                last_snap_uuid,4,self.m.target_node_pri.hublvol.nqn)
-        self.raise_exception_on_error(ret, f"could not initiate final lvol migration: {self.m.vol.lvol.uuid}")
+        ret = client1.bdev_lvol_final_migration(self.m.vol.lvol.lvol_bdev,
+                                                map_id,
+                                                last_snap_uuid, 4,
+                                                self.m.target_node_pri.hublvol.nqn)
+        self.raise_exception_on_error(ret,
+                                      f"could not initiate final lvol migration: {self.m.vol.lvol.uuid}")
         return True
 
     def delete_hublvol_controller(self):
@@ -410,7 +463,7 @@ def delete_hublvol_controller(self):
 
     def reconnect_subsystems(self):
 
-        #if "error" in ret:
+        # if "error" in ret:
         #    raise f"migration {self.m.uuid}: could not convert lvol to snapshot: {s.uuid} to remote subsystem:  {ret["error"]["message"]}:{ret["error"]["code"]}"
         return
 
@@ -418,111 +471,114 @@ def cleanup_migration(self, status: bool):
         db_controller = DBController()
         real_snapshots = db_controller.get_snapshots()
         self.unfreeze_objects()
-        #Migration was not successful
+        # Migration was not successful
         try:
-          if self.m.status >= MigrationState.HUBLVOL_CONNECTED:
-              self.delete_hublvol_controller()
-          if not status:
-              pri_node=self.m.node_pri
-              sec_node=self.m.node_sec
-          else:
-              pri_node = self.m.target_node_pri
-              sec_node = self.m.target_node_sec
-
-          if (self.m.status >= MigrationState.TARGET_LVOL_CREATED and not status) or self.m.status == MigrationState.DONE:
-              self.delete_subsystem(pri_node, self.m.vol.lvol.nqn, self.m.vol.lvol)
-              self.delete_subsystem(sec_node, self.m.vol.lvol.uuid, )
-              self.delete_lvol_from_node(pri_node, self.m.vol.lvol.uuid, True)
-              self.(sec_node, self.m.vol.lvol.uuid)
-
-          snaps = self.m.snapshots
-          snaps.reverse()
-          for sn in snaps:
-                     if sn.snap.uuid:
-                        rsn = db_controller.get_snapshot_by_id(sn.snap.uuid)
-                        if len(rsn.successor)==1:
-
-
-
-                            self.delete_lvol_from_node(pri_node, sn.snap.uuid, True)
-                            self.delete_subsystem(pri_node,sn.snap.uuid)
-                            self.delete_lvol_from_node(sec_node, sn.snap.uuid)
-                        else:
-                            break
+            if self.m.status >= MigrationState.HUBLVOL_CONNECTED:
+                self.delete_hublvol_controller()
+            if not status:
+                pri_node = self.m.node_pri
+                sec_node = self.m.node_sec
+            else:
+                pri_node = self.m.target_node_pri
+                sec_node = self.m.target_node_sec
+
+            if (self.m.status >= MigrationState.TARGET_LVOL_CREATED and not status) \
+                    or self.m.status == MigrationState.DONE:
+                self.delete_subsystem(pri_node, self.m.vol.lvol.nqn, self.m.vol.lvol)
+                self.delete_subsystem(sec_node, self.m.vol.lvol.uuid, )
+                self.delete_lvol_from_node(pri_node, self.m.vol.lvol.uuid, True)
+                # self.(sec_node, self.m.vol.lvol.uuid)
+
+            snaps = self.m.snapshots
+            snaps.reverse()
+            for sn in snaps:
+                if sn.snap.uuid:
+                    rsn = db_controller.get_snapshot_by_id(sn.snap.uuid)
+                    if len(rsn.successor) == 1:
+                        self.delete_lvol_from_node(pri_node, sn.snap.uuid, True)
+                        self.delete_subsystem(pri_node, sn.snap.uuid)
+                        self.delete_lvol_from_node(sec_node, sn.snap.uuid)
+                    else:
+                        break
         except:
             raise f"cleanup of migration not successful, will try later {self.m.uuid}"
         return True
 
     def migrate_final_lvol(self):
-      try:
-        if self.m.status==MigrationState.SNAPS_MIGRATED:
-           self.transfer_data_final()
-        elif self.m.status==MigrationState.TARGET_LVOL_CREATED:
-           self.connect_hublvol()
-        elif self.m.status==MigrationState.HUBLVOL_CONNECTED:
-           self.transfer_data_final()
-        elif self.m.status==MigrationState.TRANSFERRED_TO_TARGET:
-           self.reconnect_subsystems()
-        elif self.m.status == MigrationState.RECONNECT_DONE:
-           self.cleanup_migration(True)
-      except:
-        raise f"cannot transfer to target: {self.m.vol.lvol.uuid}"
-      return True
+        try:
+            if self.m.status == MigrationState.SNAPS_MIGRATED:
+                self.transfer_data_final()
+            elif self.m.status == MigrationState.TARGET_LVOL_CREATED:
+                self.connect_hublvol()
+            elif self.m.status == MigrationState.HUBLVOL_CONNECTED:
+                self.transfer_data_final()
+            elif self.m.status == MigrationState.TRANSFERRED_TO_TARGET:
+                self.reconnect_subsystems()
+            elif self.m.status == MigrationState.RECONNECT_DONE:
+                self.cleanup_migration(True)
+        except:
+            raise f"cannot transfer to target: {self.m.vol.lvol.uuid}"
+        return True
 
     def migrate_snaps(self):
-        if self.m.status==MigrationState.RUNNING:
-          try:
-            all_snaps_done = True
-            p=""
-            for s in self.m.snapshots:
-              if s.status is not ObjectMigrationState.DONE:
-                  all_snaps_done = False
-              if s.status in ObjectMigrationState.NEW:
-                  self.create_target_lvol(s)
-              elif s.status in ObjectMigrationState.LVOL_CREATED:
-                  self.set_mig_status(self.m.target_node_pri,s)
-              elif s.status in ObjectMigrationState.MIG_FLAG_SET:
-                  self.export_lvol(s)
-              elif s.status in ObjectMigrationState.LVOL_EXPORTED:
-                  self.connect_lvol(s)
-              elif s.status in ObjectMigrationState.LVOL_CONNECTED:
-                  self.transfer_data(s, 0)
-              elif s.status==ObjectMigrationState.TRANSFERRED:
-                   self.convert_lvol(s,p)
-              elif s.status == ObjectMigrationState.CONVERTED:
-                   self.delete_subsystem(self.m.target_node_pri,s.snap.uuid)
-              elif s.status == ObjectMigrationState.CLEANING:
-                   self.delete_lvol_from_node(self.m.target_node_sec, s.snap.uuid)
-              p=s
-            if self.m.rerun < 3 or self.time_difference()>5:
-                ret, snap_uuid=self.create_snapshot(self.m.vol)
-                sn=self.snap_assign(self.m.vol,snap_uuid)
-                self.m.snapshots.append(sn)
-                self.prev_time=datetime.now()
-                self.migrate_snaps()
-            elif all_snaps_done:
-                self.m.status = MigrationState.SNAPS_MIGRATED
-                self.m.write_to_db(self.db_controller.kv_store)
-                self.migrate_final_lvol()
-          except:
-               self.m.pre_status = self.m.status
-               self.m.status = MigrationState.FAILED
-               self.cleanup_migration(False)
+        if self.m.status == MigrationState.RUNNING:
+            try:
+                all_snaps_done = True
+                p = ""
+                for s in self.m.snapshots:
+                    if s.status is not ObjectMigrationState.DONE:
+                        all_snaps_done = False
+                    if s.status in ObjectMigrationState.NEW:
+                        self.create_target_lvol(s)
+                    elif s.status in ObjectMigrationState.LVOL_CREATED:
+                        self.set_mig_status(self.m.target_node_pri, s)
+                    elif s.status in ObjectMigrationState.MIG_FLAG_SET:
+                        self.export_lvol(s)
+                    elif s.status in ObjectMigrationState.LVOL_EXPORTED:
+                        self.connect_lvol(s)
+                    elif s.status in ObjectMigrationState.LVOL_CONNECTED:
+                        self.transfer_data(s, 0)
+                    elif s.status == ObjectMigrationState.TRANSFERRED:
+                        self.convert_lvol(s, p)
+                    elif s.status == ObjectMigrationState.CONVERTED:
+                        self.delete_subsystem(self.m.target_node_pri,
+                                              s.snap.uuid)
+                    elif s.status == ObjectMigrationState.CLEANING:
+                        self.delete_lvol_from_node(self.m.target_node_sec,
+                                                   s.snap.uuid)
+                    p = s
+                if self.m.rerun < 3 or self.time_difference() > 5:
+                    ret, snap_uuid = self.create_snapshot(self.m.vol)
+                    sn = self.snap_assign(self.m.vol, snap_uuid)
+                    self.m.snapshots.append(sn)
+                    self.prev_time = datetime.now()
+                    self.migrate_snaps()
+                elif all_snaps_done:
+                    self.m.status = MigrationState.SNAPS_MIGRATED
+                    self.m.write_to_db(self.db_controller.kv_store)
+                    self.migrate_final_lvol()
+            except:
+                self.m.pre_status = self.m.status
+                self.m.status = MigrationState.FAILED
+                self.cleanup_migration(False)
+
+            self.m.write_to_db(self.db_controller.kv_store)
         return True
 
-    def lvol_migrate(self, lvol: LVol, target_node: StorageNode, m: MigrationObject=None):
-        """Migrate a logical volume and its snapshots/clones."""
+    def lvol_migrate(self, lvol: LVol, target_node: StorageNode,
+                     m: MigrationObject = None):
+        """Initiate migration of a logical volume and its snapshots/clones."""
 
         # if this Migration Object does not exist (first call to lvol_migrate):
         if not m:
-          try:
-            self.m = MigrationObject()
-            self.m.uuid = str(uuid.uuid4())
-            self.m.create_dt = str(datetime.datetime)
-            self.m.status = MigrationState.NEW
-            self.m.write_to_db(self.db_controller.kv_store)
-          except:
-              return False #not even in database, lvol_migrate call must be repeated
+            try:
+                self.m = MigrationObject()
+                self.m.uuid = str(uuid.uuid4())
+                self.m.create_dt = str(datetime.datetime)
+                self.m.status = MigrationState.NEW
+                self.m.write_to_db(self.db_controller.kv_store)
+            except:
+                return False  # not even in database, lvol_migrate call must be repeated
         else:
             self.m = m
 
@@ -532,11 +588,13 @@ def lvol_migrate(self, lvol: LVol, target_node: StorageNode, m: MigrationObject=
             lvol.write_to_db(self.db_controller.kv_store)
 
             # copy now all data from the lvol to the migration lvol (temporary object for lvol during migration)
-
-            self.m.node_pri = StorageNode(self.db_controller.get_storage_node_by_id(lvol.node_id))
-            self.m.node_sec = self.db_controller.get_storage_node_by_id(self.m.node_pri.secondary_node_id)
+            self.m.node_pri = StorageNode(
+                self.db_controller.get_storage_node_by_id(lvol.node_id))
+            self.m.node_sec = self.db_controller.get_storage_node_by_id(
+                self.m.node_pri.secondary_node_id)
             self.m.target_node_pri = target_node
-            self.m.target_node_sec = self.db_controller.get_storage_node_by_id(self.m.target_node_pri.secondary_node_id)
+            self.m.target_node_sec = self.db_controller.get_storage_node_by_id(
+                self.m.target_node_pri.secondary_node_id)
 
             self.m.vol = self.lvol_assign(lvol)
 
@@ -559,80 +617,95 @@ def lvol_migrate(self, lvol: LVol, target_node: StorageNode, m: MigrationObject=
                     self.m.snapshots.append(sr)
         except:
             return True
-        self.m.status=MigrationState.RUNNING
+        self.m.status = MigrationState.RUNNING
         self.m.write_to_db(self.db_controller.kv_store)
         self.migrate_snaps()
         return True
 
-        if self.check_nodes_online():
-            self.m.status = MigrationState.RUNNING
-            self.m.write_to_db(self.db_controller.kv_store)
-            self.migrate_snaps()
-            return True
-        else:
-            logger.warning(f"Not all nodes online. Suspending lvol life migration {lvol.uuid}")
-            self.m.write_to_db(self.db_controller.kv_store)
-            return -1
+    def continue_migration(self, m):
+        try:
+            if m.status != MigrationState.DONE and m.status != MigrationState.FAILED:
+                if self.check_nodes_online():
+                    if m.status == MigrationState.NEW:
+                        self.lvol_migrate(m.vol.lvol, m.node_pri, m)
+                    elif m.status == MigrationState.RUNNING:
+                        for q in m.completion_poll_queue:
+                            m.completion_poll_queue.remove(q)
+                            if q.status == ObjectMigrationState.TRANSFER:
+                                result, offset = self.get_transfer_state(
+                                    self.m.node_pri, q.retry)
+                                if not result:
+                                    if q.retry > 5:
+                                        raise (f"could not transfer snapshot. max retries. "
+                                               f"name: {q.snap.lvol.lvs_name + "/" + q.snap.snap_bdev}. "
+                                               f"uuid: {q.snap.uuid}")
+                                    q.retry += 1
+                                    self.transfer_data(self.m.node_pri,
+                                                       q, offset)
+                                    m.completion_poll_queue.append(q)
+                        self.migrate_snaps()
+                    else:
+                        self.migrate_final_lvol()
+        except:
+            logger.error(
+                f"migration controller exception. Migration failed: {self.m.uuid} ")
+            self.m.status = MigrationState.FAILED
+            self.cleanup_migration(False)
+            return False
+        return True
 
+    # TODO: delete. Not used with task runner
     def check_status_migration(self, on_restart: bool):
-      while True:
-          sleep(10)
-          try:
-            migrations=self.db_controller.get_migrations()
-            for m in migrations:
-              if m.status!=MigrationState.DONE and m.status!=MigrationState.FAILED:
-                 if self.check_nodes_online():
-                     if m.status==MigrationState.NEW:
-                         self.lvol_migrate(m.vol.lvol,m.node_pri,m)
-                     elif m.status==MigrationState.RUNNING:
-                         for q in m.completion_poll_queue:
-                             m.completion_poll_queue.remove(q)
-                             if q.status==ObjectMigrationState.TRANSFER:
-                                 result, offset = self.get_transfer_state(self.m.node_pri,q.retry)
-                                 if not result:
-                                     if q.retry > 5:
-                                         raise f"could not transfer snapshot. max retries. name: {q.snap.lvol.lvs_name + "/" + q.snap.snap_bdev}. uuid: {q.snap.uuid}"
-                                     q.retry += 1
-                                     self.transfer_data(self.m.node_pri,q,offset)
-                                     m.completion_poll_queue.append(q)
-                         self.migrate_snaps()
-                     else:
-                          self.migrate_final_lvol()
-          except:
-              logger.error(f"migration controller exception. Migration failed: {self.m.uuid} ")
-              self.m.status=MigrationState.FAILED
-              self.cleanup_migration(False)
-              return False
-          return True
-
-    migrate_lock = threading.Lock()
+        while True:
+            sleep(10)
+            try:
+                migrations = self.db_controller.get_migrations()
+                for m in migrations:
+                    if m.status != MigrationState.DONE and m.status != MigrationState.FAILED:
+                        if self.check_nodes_online():
+                            if m.status == MigrationState.NEW:
+                                self.lvol_migrate(m.vol.lvol, m.node_pri, m)
+                            elif m.status == MigrationState.RUNNING:
+                                for q in m.completion_poll_queue:
+                                    m.completion_poll_queue.remove(q)
+                                    if q.status == ObjectMigrationState.TRANSFER:
+                                        result, offset = self.get_transfer_state(
+                                            self.m.node_pri, q.retry)
+                                        if not result:
+                                            if q.retry > 5:
+                                                raise f"could not transfer snapshot. max retries. name: {q.snap.lvol.lvs_name + "/" + q.snap.snap_bdev}. uuid: {q.snap.uuid}"
+                                            q.retry += 1
+                                            self.transfer_data(self.m.node_pri,
+                                                               q, offset)
+                                            m.completion_poll_queue.append(q)
+                                self.migrate_snaps()
+                            else:
+                                self.migrate_final_lvol()
+            except:
+                logger.error(
+                    f"migration controller exception. Migration failed: {self.m.uuid} ")
+                self.m.status = MigrationState.FAILED
+                self.cleanup_migration(False)
+                return False
+            return True
 
     def add_new_migration(self, lvol, target_node: StorageNode):
-      with self.migrate_lock:
+        with self.migrate_lock:
             try:
-              migrations = self.db_controller.get_migrations()
-              for m in migrations:
-                if lvol.node_id==m.vol.lvol.node_id and (m.status!=MigrationState.DONE or m.status!=MigrationState.FAILED_AND_CLEANED):
-                   raise exception("cannot add migration - ongoing migration")
-              self.lvol_migrate(lvol, target_node)
+                migrations = self.db_controller.get_migrations()
+                for m in migrations:
+                    if lvol.node_id == m.vol.lvol.node_id and m.status not in [
+                        MigrationState.DONE, MigrationState.FAILED_AND_CLEANED]:
+                        raise exception(
+                            "cannot add migration - ongoing migration")
             except:
-              logger.error(f"could not add lvol {lvol.uuid} for migration as another migration is currently running.")
-              return False
-            return self.lvol_migrate(lvol,target_node)
-
-    def start_service(self, on_restart=False):
-        """
-        Starts the migration checker in a background thread.
-        """
-        self._thread = threading.Thread(
-            target=self.check_status_migration, args=(on_restart,), daemon=True
-        )
-        self._thread.start()
-
-    def stop_service(self):
-        """
-        Stops the background service gracefully.
-        """
-        self._stop_event.set()
-        self._thread.join()
+                logger.error(
+                    f"could not add lvol {lvol.uuid} for migration as another migration is currently running.")
+                return False
+            return self.lvol_migrate(lvol, target_node)
+
+    def cancel_migration(self):
+        self.m.status = MigrationState.FAILED
+        self.cleanup_migration(False)
+        self.m.write_to_db(self.db_controller.kv_store)
 
diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py
index b7c434f63..1a00d9f15 100644
--- a/simplyblock_core/controllers/tasks_controller.py
+++ b/simplyblock_core/controllers/tasks_controller.py
@@ -75,6 +75,11 @@ def _add_task(function_name, cluster_id, node_id, device_id,
         if task_id:
             logger.info(f"Task found, skip adding new task: {task_id}")
             return False
+    elif function_name == JobSchedule.FN_LVOL_MIGRATION:
+        task_id = get_lvol_mig_task(cluster_id, node_id, function_params['lvol_id'])
+        if task_id:
+            logger.info(f"Task found, skip adding new task: {task_id}")
+            return False
 
     task_obj = JobSchedule()
     task_obj.uuid = str(uuid.uuid4())
@@ -397,6 +402,7 @@ def add_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name):
     return _add_task(JobSchedule.FN_LVOL_SYNC_DEL, cluster_id, node_id, "",
                      function_params={"lvol_bdev_name": lvol_bdev_name}, max_retry=10)
 
+
 def get_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name=None):
     tasks = db.get_job_tasks(cluster_id)
     for task in tasks:
@@ -409,3 +415,36 @@ def get_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name=None):
                     return task.uuid
     return False
 
+
+def get_active_lvol_mig_task(cluster_id, node_id):
+    tasks = db.get_job_tasks(cluster_id)
+    for task in tasks:
+        if task.function_name == JobSchedule.FN_LVOL_MIGRATION \
+                and task.node_id == node_id:
+            if task.status == JobSchedule.STATUS_RUNNING \
+                    and task.canceled is False:
+                return task.uuid
+    return False
+
+
+def get_lvol_mig_task(cluster_id, node_id, lvol_id=None):
+    tasks = db.get_job_tasks(cluster_id)
+    for task in tasks:
+        if task.function_name == JobSchedule.FN_LVOL_SYNC_DEL and task.node_id == node_id :
+            if task.status != JobSchedule.STATUS_DONE and task.canceled is False:
+                if task.function_params["lvol_id"] == lvol_id:
+                    return task.uuid
+    return False
+
+
+def add_lvol_mig_task(cluster_id, lvol_id, target_node_id):
+    lvol = db.get_lvol_by_id(lvol_id)
+    params = {
+        "lvol_id": lvol_id,
+        "target_node_id": target_node_id,
+        "migration_id": None,
+    }
+
+    return _add_task(JobSchedule.FN_LVOL_MIGRATION, cluster_id, lvol.node_id,
+                     "",
+                     function_params=params, max_retry=10)
diff --git a/simplyblock_core/db_controller.py b/simplyblock_core/db_controller.py
index 5cf48e976..bfe9500b4 100644
--- a/simplyblock_core/db_controller.py
+++ b/simplyblock_core/db_controller.py
@@ -313,3 +313,9 @@ def get_migrations(self) -> List[MigrationObject]:
         for m in ret:
             migrations.append(m)
         return sorted(migrations, key=lambda x: x.create_dt)
+
+    def get_migration_by_id(self, id) -> MigrationObject:
+        migrations = MigrationObject().read_from_db(self.kv_store, id=id)
+        if not migrations:
+            raise KeyError(f'Migration {id} not found')
+        return migrations[0]
diff --git a/simplyblock_core/models/job_schedule.py b/simplyblock_core/models/job_schedule.py
index bbdcd7871..db2f6a562 100644
--- a/simplyblock_core/models/job_schedule.py
+++ b/simplyblock_core/models/job_schedule.py
@@ -23,6 +23,7 @@ class JobSchedule(BaseModel):
     FN_BALANCING_AFTER_DEV_EXPANSION = "balancing_on_dev_add"
     FN_JC_COMP_RESUME = "jc_comp_resume"
     FN_LVOL_SYNC_DEL = "lvol_sync_del"
+    FN_LVOL_MIGRATION = "lvol_migration"
 
     canceled: bool = False
     cluster_id: str = ""
diff --git a/simplyblock_core/services/tasks_runner_lvol_migration.py b/simplyblock_core/services/tasks_runner_lvol_migration.py
new file mode 100644
index 000000000..e95b0b720
--- /dev/null
+++ b/simplyblock_core/services/tasks_runner_lvol_migration.py
@@ -0,0 +1,111 @@
+# coding=utf-8
+import time
+from datetime import datetime, timezone
+
+from simplyblock_core import db_controller, utils, constants
+from simplyblock_core.controllers import tasks_controller
+from simplyblock_core.controllers.lvol_migration_controller import \
+    MigrationController
+from simplyblock_core.models.cluster import Cluster
+from simplyblock_core.models.job_schedule import JobSchedule
+
+
+
+from simplyblock_core.models.nvme_device import NVMeDevice
+from simplyblock_core.models.storage_node import StorageNode
+from simplyblock_core.rpc_client import RPCClient
+
+
+def task_runner(task):
+    try:
+        snode = db.get_storage_node_by_id(task.node_id)
+    except KeyError:
+        task.status = JobSchedule.STATUS_DONE
+        task.function_result = f"Node not found: {task.node_id}"
+        task.write_to_db(db.kv_store)
+        return True
+
+    migration_id = task.function_params.get("migration_id")
+    if migration_id:
+        try:
+            migration = db.get_migration_by_id(migration_id)
+        except KeyError:
+            task.status = JobSchedule.STATUS_DONE
+            task.function_result = f"Migration not found: {migration_id}"
+            task.write_to_db(db.kv_store)
+            return True
+    else:
+        migration = None
+
+    migration_controller.m = migration
+
+    if task.canceled:
+        task.function_result = "canceled"
+        if migration:
+            # initiate cancelling of migration that was is progress
+            migration_controller.cancel_migration()
+        else:
+            task.status = JobSchedule.STATUS_DONE
+            task.write_to_db(db.kv_store)
+            return True
+        return False
+
+    if snode.status != StorageNode.STATUS_ONLINE:
+        task.function_result = "node is not online, retrying"
+        task.retry += 1
+        task.status = JobSchedule.STATUS_SUSPENDED
+        task.write_to_db(db.kv_store)
+        return False
+
+    cluster = db.get_cluster_by_id(task.cluster_id)
+    if cluster.status not in [Cluster.STATUS_ACTIVE, Cluster.STATUS_DEGRADED, Cluster.STATUS_READONLY]:
+        task.function_result = "cluster is not active, retrying"
+        task.status = JobSchedule.STATUS_SUSPENDED
+        task.retry += 1
+        task.write_to_db(db.kv_store)
+        return False
+
+    lvol = db.get_lvol_by_id(task.function_params["lvol_id"])
+    target_node = db.get_storage_node_by_id(task.function_params["target_node_id"])
+
+    if task.status in [JobSchedule.STATUS_NEW, JobSchedule.STATUS_SUSPENDED]:
+        # initiate migration
+        result = migration_controller.lvol_migrate(lvol, target_node, migration)
+        if result:
+            task.function_result = JobSchedule.STATUS_RUNNING
+            task.status = JobSchedule.STATUS_RUNNING
+            task.write_to_db(db.kv_store)
+
+
+    task.retry += 1
+    task.write_to_db(db.kv_store)
+    return False
+
+
+logger = utils.get_logger(__name__)
+migration_controller = MigrationController()
+# get DB controller
+db = db_controller.DBController()
+logger.info("Starting Tasks runner...")
+while True:
+    time.sleep(3)
+    clusters = db.get_clusters()
+    if not clusters:
+        logger.error("No clusters found!")
+    else:
+        for cl in clusters:
+            tasks = db.get_job_tasks(cl.get_id(), reverse=False)
+            for task in tasks:
+                if task.function_name == JobSchedule.FN_LVOL_MIGRATION:
+                    if task.status in [JobSchedule.STATUS_NEW, JobSchedule.STATUS_SUSPENDED]:
+                        active_task = tasks_controller.get_active_lvol_mig_task(
+                            task.cluster_id, task.node_id)
+                        if active_task:
+                            logger.info("task found on same node, retry")
+                            continue
+                    if task.status != JobSchedule.STATUS_DONE:
+                        # get new task object because it could be changed from cancel task
+                        task = db.get_task_by_id(task.uuid)
+                        res = task_runner(task)
+                        if not res:
+                            time.sleep(2)
diff --git a/simplyblock_core/storage_node_ops.py b/simplyblock_core/storage_node_ops.py
index ea9186960..a6d89b74d 100644
--- a/simplyblock_core/storage_node_ops.py
+++ b/simplyblock_core/storage_node_ops.py
@@ -22,12 +22,10 @@
 from simplyblock_core.constants import LINUX_DRV_MASS_STORAGE_NVME_TYPE_ID, LINUX_DRV_MASS_STORAGE_ID
 from simplyblock_core.controllers import lvol_controller, storage_events, snapshot_controller, device_events, \
     device_controller, tasks_controller, health_controller, tcp_ports_events, qos_controller
-from simplyblock_core.controllers.lvol_migration_controller import MigrationController
 from simplyblock_core.db_controller import DBController
 from simplyblock_core.fw_api_client import FirewallClient
 from simplyblock_core.models.iface import IFace
 from simplyblock_core.models.job_schedule import JobSchedule
-from simplyblock_core.models.lvol_migration import MigrationState
 from simplyblock_core.models.lvol_model import LVol
 from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice
 from simplyblock_core.models.snapshot import SnapShot
@@ -128,15 +126,6 @@ def connect_device(name: str, device: NVMeDevice, node: StorageNode, bdev_names:
 
     return bdev_name
 
-#if a node was rebooted during an ongoing migration,
-def restart_migration(node:StorageNode):
-    db_controller = DBController()
-    migs=db_controller.get_migrations()
-    for m in migs:
-        if m.node_pri==node.uuid:
-          if m.status!=MigrationState.DONE:
-            add_task()
-    return
 
 def get_next_cluster_device_order(db_controller, cluster_id):
     max_order = 0
@@ -2056,7 +2045,6 @@ def restart_storage_node(
                     online_devices_list.append(dev.get_id())
             if online_devices_list:
                 tasks_controller.add_device_mig_task(online_devices_list, snode.cluster_id)
-            restart_migration(snode)
             return True