From 8ceb83038965fef4399718ea0b4abc3b21e84261 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Mon, 27 Apr 2026 12:39:40 +0100
Subject: [PATCH 01/36] introduce device.py and set up branch

---
 pyop3/device.py | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 pyop3/device.py

diff --git a/pyop3/device.py b/pyop3/device.py
new file mode 100644
index 0000000000..45048e3968
--- /dev/null
+++ b/pyop3/device.py
@@ -0,0 +1 @@
+# File to handle op3.device context manager

From ba6de9b3cef1ad8f017664d3a651c04131f83632 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Tue, 28 Apr 2026 09:19:49 +0100
Subject: [PATCH 02/36] const parameter for host device

---
 pyop3/__init__.py |  4 ++++
 pyop3/device.py   | 10 ++++++++++
 2 files changed, 14 insertions(+)

diff --git a/pyop3/__init__.py b/pyop3/__init__.py
index 1cfc849fba..45ac9ebef1 100644
--- a/pyop3/__init__.py
+++ b/pyop3/__init__.py
@@ -87,6 +87,10 @@ def _init_likwid():
     exscan,
     AssignmentType,
 )
+from pyop3.device import ( # noqa: F401
+    HOST_DEVICE,
+    Device
+)
 from pyop3.sf import StarForest, single_star_sf, local_sf
 import pyop3.sf
 from pyop3.tree.index_tree.parse import as_index_forest
diff --git a/pyop3/device.py b/pyop3/device.py
index 45048e3968..69ad5d7b46 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -1 +1,11 @@
 # File to handle op3.device context manager
+import abc
+import contextlib
+import warnings
+
+# TODO: Consider if there is approach to actually identify host method calling the python script
+# For now, we just have a constant argument.
+HOST_DEVICE = "cpu" 
+
+class Device(abc.ABC):
+  pass

From 6a9fd213cd8d84aeb65a8f45b6c749570ebaa4be Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Tue, 28 Apr 2026 12:16:28 +0100
Subject: [PATCH 03/36] include gpu demo and update petsc version as 3.24.5
 misaligned for PCPatch*ExteriorFacets

---
 pyop3_gpu_demo.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++
 pyproject.toml    |  2 +-
 2 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 pyop3_gpu_demo.py

diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
new file mode 100644
index 0000000000..9a67fdfa22
--- /dev/null
+++ b/pyop3_gpu_demo.py
@@ -0,0 +1,69 @@
+"""
+Useful links:
+
+  * https://github.com/firedrakeproject/firedrake/blob/main/.github/workflows/core.yml#L476
+
+    How to build a GPU-enabled Firedrake.
+
+  * https://github.com/firedrakeproject/firedrake/blob/connorjward/pyop3-gpu/pyop3/device.py
+
+    An implementation of the 'device' context manager. It needs a big refactor.
+
+  * https://github.com/OP2/PyOP2/pull/691/changes#diff-f8765d963b5adb1788f453e259d8cd45f29cee9670563ddb99b9fe2bba115a12
+
+    Using a wrapper type to track changes between host and device. In pyop3
+    this would be the 'ArrayBuffer' object and link into existing
+    state tracking.
+"""
+
+import numpy as np
+
+from firedrake import *
+import pyop3 as op3
+
+
+# made up API, we need some way to identify the device
+host = op3.HOST_DEVICE  # or similar
+gpu = op3.Device()
+
+mesh = UnitSquareMesh(3, 3)
+V = FunctionSpace(mesh, "P", 2)
+
+f = Function(V).assign(10)
+g = Function(V)
+
+assert isinstance(f.dat.data_ro, np.ndarray)
+assert isinstance(g.dat.data_ro, np.ndarray)
+
+# state tracking checks, .buffer.state is now device-specific
+assert f.dat.buffer.state[host] == 1  # modified once
+assert f.dat.buffer.state[gpu] == 0  # untouched
+assert g.dat.buffer.state[host] == 0  # untouched
+assert g.dat.buffer.state[gpu] == 0  # untouched
+
+with op3.device(gpu):
+    # Getting the .data attribute on the GPU should give us back a GPU array type
+    assert not isinstance(f.dat.data_ro, np.ndarray)
+    assert not isinstance(g.dat.data_ro, np.ndarray)
+
+    # Do the assignment using array operations
+    g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="array")
+
+    # Do the assignment using MLIR (this is a later step)
+    # g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="compile")
+
+    # state tracking checks
+    assert f.dat.buffer.state[host] == 1  # modified once
+    assert f.dat.buffer.state[gpu] == 1  # matches host
+    assert g.dat.buffer.state[host] == 0  # untouched
+    assert g.dat.buffer.state[gpu] == 1  # modified once
+
+assert isinstance(f.dat.data_ro, np.ndarray)
+assert isinstance(g.dat.data_ro, np.ndarray)
+assert (g.dat.data_ro == 23).all()
+
+# state tracking checks
+assert f.dat.buffer.state[host] == 1  # modified once
+assert f.dat.buffer.state[gpu] == 1  # matches host
+assert g.dat.buffer.state[host] == 1  # matches device
+assert g.dat.buffer.state[gpu] == 1  # modified once
diff --git a/pyproject.toml b/pyproject.toml
index 3b2a1003e4..1e9153c5d5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
   "numpy",
   "packaging",
   # TODO RELEASE
-  # "petsc4py==3.24.5",
+  # "petsc4py==3.25.0",
   # UNDO ME
   "petsctools @ git+https://github.com/firedrakeproject/petsctools.git@connorjward/cpetsc",
   "pkgconfig",

From fb3d0b7885f09ecc118a70f8cd854d952bd31036 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Tue, 28 Apr 2026 13:57:52 +0100
Subject: [PATCH 04/36] noting areas for change

---
 pyop3/buffer.py |  8 +++++++-
 pyop3/device.py | 26 +++++++++++++++++++++-----
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index e03d964f32..c1f0fb3160 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -226,16 +226,21 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer):
 
     # {{{ Instance attrs
 
+    # TODO: Update to lazily allocated pair of host/device arrays
     _lazy_data: np.ndarray = dataclasses.field(repr=False)
     sf: StarForest
     _name: str
     _constant: bool
     _rank_equal: bool
     _ordered: bool
+    # TODO: Device awareness variable - subscriber/observer? 
+    _awareness: int
 
     _max_value: np.number | None = None
 
-    _state: int = 0
+    # TODO: Mutable pair of variables corresponding to state for host & device  
+    # Not done as dataclass requires immutable or default factory but latter cannot be attr
+    _state: int = 0 
 
     # flags for tracking parallel correctness
     _leaves_valid: bool = True
@@ -464,6 +469,7 @@ def assemble(self) -> None:
     def leaves_valid(self) -> bool:
         return self._leaves_valid
 
+    # TODO: Update this to provide CPU/GPU array as per awareness
     @property
     def _data(self):
         if self._lazy_data is None:
diff --git a/pyop3/device.py b/pyop3/device.py
index 69ad5d7b46..363e2e7bf3 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -1,11 +1,27 @@
 # File to handle op3.device context manager
-import abc
+from abc import ABCMeta, abstractmethod
 import contextlib
 import warnings
 
-# TODO: Consider if there is approach to actually identify host method calling the python script
-# For now, we just have a constant argument.
-HOST_DEVICE = "cpu" 
+# TODO: The constant should be elsewhere but temporary here 
+HOST_DEVICE = 0 
 
-class Device(abc.ABC):
+class Device(metaclass=ABCMeta):
+  _available: bool
+  _name: str
+
+  def __init__(self):
+    pass
+
+class GPU(Device):
   pass
+
+@contextlib.contextmanager
+def device(dev: Device): 
+  # TODO: Update buffers and copy to
+  try:
+    yield "not implemented"
+  finally:
+    # TODO: Update buffers and copy back
+    pass
+    

From e39f2150e5d8fa0af8677f707f8c94e6e824b0bc Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Tue, 28 Apr 2026 16:41:33 +0100
Subject: [PATCH 05/36] introducing context variable and lazy cupy

---
 pyop3/__init__.py |  3 ++-
 pyop3/buffer.py   | 26 ++++++++++++++++++++------
 pyop3/device.py   | 24 +++++++++++++++++++++---
 pyop3_gpu_demo.py | 13 +++++++------
 4 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/pyop3/__init__.py b/pyop3/__init__.py
index 45ac9ebef1..e5590ac2d1 100644
--- a/pyop3/__init__.py
+++ b/pyop3/__init__.py
@@ -89,7 +89,8 @@ def _init_likwid():
 )
 from pyop3.device import ( # noqa: F401
     HOST_DEVICE,
-    Device
+    GPU,
+    offloading
 )
 from pyop3.sf import StarForest, single_star_sf, local_sf
 import pyop3.sf
diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index c1f0fb3160..4197e4ea81 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -11,6 +11,7 @@
 from typing import Any, ClassVar, Hashable
 
 import numpy as np
+import cupy as cp
 from mpi4py import MPI
 from petsc4py import PETSc
 
@@ -20,6 +21,7 @@
 from pyop3.dtypes import IntType, ScalarType, DTypeT
 from pyop3.sf import DistributedObject, NullStarForest, StarForest, local_sf
 from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly
+from pyop3.device import _current_device
 
 from ._buffer_cy import set_petsc_mat_diagonal
 
@@ -227,7 +229,7 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer):
     # {{{ Instance attrs
 
     # TODO: Update to lazily allocated pair of host/device arrays
-    _lazy_data: np.ndarray = dataclasses.field(repr=False)
+    _lazy_data: np.ndarray | cp.ndarray = dataclasses.field(repr=False)
     sf: StarForest
     _name: str
     _constant: bool
@@ -472,11 +474,23 @@ def leaves_valid(self) -> bool:
     # TODO: Update this to provide CPU/GPU array as per awareness
     @property
     def _data(self):
-        if self._lazy_data is None:
-            self._lazy_data = np.zeros(self.shape, dtype=self.dtype)
-        if self.name == "array_247_buffer":
-            breakpoint()
-        return self._lazy_data
+        # NOTE: Testing context manager
+        _location = _current_device.get()
+
+        if _location == "cpu":
+          if self._lazy_data is None:
+              self._lazy_data = np.zeros(self.shape, dtype=self.dtype)
+          if self.name == "array_247_buffer":
+              breakpoint()
+          return self._lazy_data 
+        elif _location == "gpu":
+          if self._lazy_data is None:
+              self._lazy_data = cp.zeros(self.shape, dtype=self.dtype)
+          if self.name == "array_247_buffer":
+              breakpoint()
+          return cp.asarray(self._lazy_data)
+        else:
+          raise RuntimeError("Offload device not supported")
 
     # TODO: I think the halo bits should only be handled at the Dat level via the
     # axis tree. Here we can just consider the array.
diff --git a/pyop3/device.py b/pyop3/device.py
index 363e2e7bf3..73429073e1 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -1,27 +1,45 @@
 # File to handle op3.device context manager
 from abc import ABCMeta, abstractmethod
 import contextlib
+import contextvars
 import warnings
 
 # TODO: The constant should be elsewhere but temporary here 
-HOST_DEVICE = 0 
+HOST_DEVICE = "cpu" 
+
+# NOTE: Use contextvars to act as a bridge between buffer and manager classes
+_current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE)
 
 class Device(metaclass=ABCMeta):
   _available: bool
   _name: str
 
   def __init__(self):
+    self._available = True
     pass
 
+  @abstractmethod
+  def __repr__(self):
+    pass 
+
 class GPU(Device):
+  
+  def __init__(self):
+    super().__init__()
+    self._name = "gpu"
   pass
 
+  def __repr__(self):
+    return self._name
+
 @contextlib.contextmanager
-def device(dev: Device): 
+def offloading(dev: Device): 
   # TODO: Update buffers and copy to
+  token = _current_device.set(dev)
   try:
-    yield "not implemented"
+    yield
   finally:
     # TODO: Update buffers and copy back
+    _current_device.reset(token)
     pass
     
diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
index 9a67fdfa22..6d7ad4bb03 100644
--- a/pyop3_gpu_demo.py
+++ b/pyop3_gpu_demo.py
@@ -24,7 +24,7 @@
 
 # made up API, we need some way to identify the device
 host = op3.HOST_DEVICE  # or similar
-gpu = op3.Device()
+gpu = op3.GPU()
 
 mesh = UnitSquareMesh(3, 3)
 V = FunctionSpace(mesh, "P", 2)
@@ -36,13 +36,14 @@
 assert isinstance(g.dat.data_ro, np.ndarray)
 
 # state tracking checks, .buffer.state is now device-specific
-assert f.dat.buffer.state[host] == 1  # modified once
-assert f.dat.buffer.state[gpu] == 0  # untouched
-assert g.dat.buffer.state[host] == 0  # untouched
-assert g.dat.buffer.state[gpu] == 0  # untouched
+# assert f.dat.buffer.state[host] == 1  # modified once
+# assert f.dat.buffer.state[gpu] == 0  # untouched
+# assert g.dat.buffer.state[host] == 0  # untouched
+# assert g.dat.buffer.state[gpu] == 0  # untouched
 
-with op3.device(gpu):
+with op3.offloading(gpu):
     # Getting the .data attribute on the GPU should give us back a GPU array type
+
     assert not isinstance(f.dat.data_ro, np.ndarray)
     assert not isinstance(g.dat.data_ro, np.ndarray)
 

From 2a54e848fc4e5aa19bf0cea255b7451720558bda Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Wed, 29 Apr 2026 12:23:20 +0100
Subject: [PATCH 06/36] lazy evaluation of arrays

---
 pyop3/buffer.py     | 75 +++++++++++++++++++++++++++------------------
 pyop3/device.py     | 57 +++++++++++++++++++---------------
 pyop3/exceptions.py |  3 ++
 pyop3/utils.py      | 15 +++++++--
 pyop3_gpu_demo.py   |  8 ++---
 5 files changed, 98 insertions(+), 60 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 4197e4ea81..d9be4a46b9 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -21,7 +21,10 @@
 from pyop3.dtypes import IntType, ScalarType, DTypeT
 from pyop3.sf import DistributedObject, NullStarForest, StarForest, local_sf
 from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly
-from pyop3.device import _current_device
+from pyop3.device import (
+    _current_device,
+    HOST_DEVICE
+)
 
 from ._buffer_cy import set_petsc_mat_diagonal
 
@@ -224,25 +227,22 @@ def handle(self, *, nest_indices: tuple[tuple[int, ...], ...] = ()) -> Any:
 # copies should live in this class.
 @pyop3.record.record()
 class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer):
-    """A buffer whose underlying data structure is a numpy array."""
+    """A buffer whose underlying data structure is a lazily-evaluated NumPy/CuPy array."""
 
     # {{{ Instance attrs
 
     # TODO: Update to lazily allocated pair of host/device arrays
-    _lazy_data: np.ndarray | cp.ndarray = dataclasses.field(repr=False)
+    _lazy_data: dict[str, np.ndarray | cp.ndarray] = dataclasses.field(repr=False)
     sf: StarForest
     _name: str
     _constant: bool
     _rank_equal: bool
     _ordered: bool
-    # TODO: Device awareness variable - subscriber/observer? 
-    _awareness: int
+    # TODO: Mutable pair of variables corresponding to state for host & device  
+    _state: dict[str, int]
 
     _max_value: np.number | None = None
 
-    # TODO: Mutable pair of variables corresponding to state for host & device  
-    # Not done as dataclass requires immutable or default factory but latter cannot be attr
-    _state: int = 0 
 
     # flags for tracking parallel correctness
     _leaves_valid: bool = True
@@ -254,8 +254,10 @@ def instruction_executor_cache_key(self, buffer_counter: Mapping[AbstractBuffer,
             type(self), self._constant, self._rank_equal, self._ordered, 
             self.dtype, buffer_counter[self])
 
-    def __init__(self, data: np.ndarray, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False):
-        data = data.flatten()
+    def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False):
+
+        data_mapping = {pyop3.HOST_DEVICE: None, "gpu": None}
+
         if sf is None:
             sf = NullStarForest(data.size)
         name = utils.maybe_generate_name(name, prefix, self.DEFAULT_PREFIX)
@@ -265,16 +267,24 @@ def __init__(self, data: np.ndarray, sf: StarForest | None = None, *, name: str|
         if rank_equal and not constant:
             raise ValueError
 
+        # TODO: CuPy has no support for `writeable` flag 
         if constant:
             data.flags.writeable = False
 
-        self._lazy_data = data
+        if isinstance(data, np.ndarray):
+            data_mapping[pyop3.HOST_DEVICE] = data
+        elif isinstance(data, cp.ndarray):
+            data_mapping["gpu"] = data 
+            
+        self._lazy_data = data_mapping 
         self.sf = sf
         self._name = name
         self._constant = constant
         self._rank_equal = rank_equal
         self._max_value = max_value
         self._ordered = ordered
+        # TODO: Quite ugly, maybe switch to ENUM in future.
+        self._state = {pyop3.HOST_DEVICE: 0, "gpu": 0}
         self.__post_init__()
 
     def __post_init__(self) -> None:
@@ -312,8 +322,10 @@ def dtype(self) -> np.dtype:
         return self._data.dtype
 
     def inc_state(self) -> None:
-        self._state += 1
+        self._state[self.context] += 1
 
+    # NOTE: Why is this using _lazy_data instead of _data?
+    # TODO: Either adjust for _lazy_data mapping or switch to _data
     def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
         # make sure that there are no pending transfers before we copy
         self.assemble()
@@ -323,8 +335,16 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
         else:
             data = np.zeros_like(self._lazy_data)
         return self.__record_init__(_name=name, _lazy_data=data)
+    
+    def sync_to_host(self):
+        self.state[pyop3.HOST_DEVICE] = cp.asnumpy(self.state["gpu"])
+        self.state[pyop3.HOST_DEVICE] = self.state["gpu"]
 
     is_nested: ClassVar[bool] = False
+    
+    @property
+    def context(self) -> str:
+        return str(_current_device.get())
 
     @property
     def handle(self) -> np.ndarray:
@@ -471,26 +491,23 @@ def assemble(self) -> None:
     def leaves_valid(self) -> bool:
         return self._leaves_valid
 
-    # TODO: Update this to provide CPU/GPU array as per awareness
+    # TODO: Need logic to recognise when to copy data to GPU
     @property
     def _data(self):
-        # NOTE: Testing context manager
-        _location = _current_device.get()
-
-        if _location == "cpu":
-          if self._lazy_data is None:
-              self._lazy_data = np.zeros(self.shape, dtype=self.dtype)
-          if self.name == "array_247_buffer":
-              breakpoint()
-          return self._lazy_data 
-        elif _location == "gpu":
-          if self._lazy_data is None:
-              self._lazy_data = cp.zeros(self.shape, dtype=self.dtype)
-          if self.name == "array_247_buffer":
-              breakpoint()
-          return cp.asarray(self._lazy_data)
+        context = self.context
+
+        if context == "cpu":
+            if self._lazy_data[context] is None:
+                self._lazy_data[context] = np.zeros(self.shape, dtype=self.dtype)
+        elif context == "gpu":
+            if self._lazy_data[context] is None:
+                self._lazy_data[context] = cp.zeros(self.shape, dtype=self.dtype)
         else:
-          raise RuntimeError("Offload device not supported")
+            raise RuntimeError("Offload device not supported")
+
+        if self.name == "array_247_buffer":
+            breakpoint()
+        return self._lazy_data[context]
 
     # TODO: I think the halo bits should only be handled at the Dat level via the
     # axis tree. Here we can just consider the array.
diff --git a/pyop3/device.py b/pyop3/device.py
index 73429073e1..2bbc0c0c83 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -5,41 +5,48 @@
 import warnings
 
 # TODO: The constant should be elsewhere but temporary here 
-HOST_DEVICE = "cpu" 
+HOST_DEVICE = "cpu"
 
 # NOTE: Use contextvars to act as a bridge between buffer and manager classes
 _current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE)
 
 class Device(metaclass=ABCMeta):
-  _available: bool
-  _name: str
+    _available: bool
+    _name: str
+    _accessed_buffer: list
 
-  def __init__(self):
-    self._available = True
-    pass
+    def __init__(self):
+        self._available = True
+        self._accessed_buffer = []
+        pass
+
+    @abstractmethod
+    def __repr__(self):
+        pass 
 
-  @abstractmethod
-  def __repr__(self):
-    pass 
+    @abstractmethod
+    def __str__(self):
+        pass
 
 class GPU(Device):
-  
-  def __init__(self):
-    super().__init__()
-    self._name = "gpu"
-  pass
+    
+    def __init__(self):
+        super().__init__()
+        self._name = "gpu"
+    pass
 
-  def __repr__(self):
-    return self._name
+    def __repr__(self):
+        return self._name
+
+    def __str__(self):
+        return self._name
 
 @contextlib.contextmanager
 def offloading(dev: Device): 
-  # TODO: Update buffers and copy to
-  token = _current_device.set(dev)
-  try:
-    yield
-  finally:
-    # TODO: Update buffers and copy back
-    _current_device.reset(token)
-    pass
-    
+    token = _current_device.set(dev)
+    try:
+        yield
+    finally:
+        _current_device.reset(token)
+        # TODO: Update buffers and copy back
+        # NOTE: Does this have to happen lazily if we do not have a record of accessed buffers?
diff --git a/pyop3/exceptions.py b/pyop3/exceptions.py
index 22fb6f8bf7..4a8cc5f1dc 100644
--- a/pyop3/exceptions.py
+++ b/pyop3/exceptions.py
@@ -24,6 +24,9 @@ class ValueMismatchException(Pyop3Exception):
 class UnhashableObjectException(Pyop3Exception, TypeError):
     pass
 
+class UnsupportedArrayException(Pyop3Exception, TypeError):
+    pass
+
 
 # {{{ caching
 
diff --git a/pyop3/utils.py b/pyop3/utils.py
index 3fbe8778ad..b9f34d4e10 100644
--- a/pyop3/utils.py
+++ b/pyop3/utils.py
@@ -14,6 +14,7 @@
 
 import cachetools
 import numpy as np
+import cupy as cp
 import pytools
 from immutabledict import immutabledict
 from mpi4py import MPI
@@ -23,7 +24,7 @@
 from pyop3.config import config
 from pyop3.constants import PYOP3_DECIDE, _nothing
 from pyop3.dtypes import DTypeT, IntType
-from pyop3.exceptions import CommMismatchException, CommNotFoundException, Pyop3Exception, UnhashableObjectException
+from pyop3.exceptions import CommMismatchException, CommNotFoundException, Pyop3Exception, UnhashableObjectException, UnsupportedArrayException
 from pyop3.mpi import collective
 
 
@@ -331,12 +332,22 @@ def map_when(func, when_func, iterable):
             yield item
 
 
-def readonly(array):
+@functools.singledispatch
+def readonly(array: Any) -> Any:
+    raise UnsupportedArrayException 
+
+@readonly.register
+def _(array: np.ndarray) -> np.ndarray:
     """Return a readonly view of a numpy array."""
     view = array.view()
     view.setflags(write=False)
     return view
 
+@readonly.register
+def _(array: cp.ndarray) -> cp.ndarray:
+    """ Return a view of a CuPy array."""
+    view = array.view()
+    return view
 
 def debug_assert(predicate, msg=None):
     if config.debug_checks:
diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
index 6d7ad4bb03..6e735e7126 100644
--- a/pyop3_gpu_demo.py
+++ b/pyop3_gpu_demo.py
@@ -36,10 +36,10 @@
 assert isinstance(g.dat.data_ro, np.ndarray)
 
 # state tracking checks, .buffer.state is now device-specific
-# assert f.dat.buffer.state[host] == 1  # modified once
-# assert f.dat.buffer.state[gpu] == 0  # untouched
-# assert g.dat.buffer.state[host] == 0  # untouched
-# assert g.dat.buffer.state[gpu] == 0  # untouched
+assert f.dat.buffer.state[host] == 1  # modified once
+assert f.dat.buffer.state[str(gpu)] == 0  # untouched
+assert g.dat.buffer.state[host] == 0  # untouched
+assert g.dat.buffer.state[str(gpu)] == 0  # untouched
 
 with op3.offloading(gpu):
     # Getting the .data attribute on the GPU should give us back a GPU array type

From 98d60103386c3b742cbdfc0688906081714c0dcc Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Wed, 29 Apr 2026 13:47:37 +0100
Subject: [PATCH 07/36] passes basic script functionality

---
 pyop3/buffer.py   | 64 ++++++++++++++++++++++++++++++++---------------
 pyop3_gpu_demo.py |  8 +++---
 2 files changed, 48 insertions(+), 24 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index d9be4a46b9..52f75c3f57 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -22,8 +22,8 @@
 from pyop3.sf import DistributedObject, NullStarForest, StarForest, local_sf
 from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly
 from pyop3.device import (
-    _current_device,
-    HOST_DEVICE
+    HOST_DEVICE,
+    _current_device
 )
 
 from ._buffer_cy import set_petsc_mat_diagonal
@@ -322,7 +322,7 @@ def dtype(self) -> np.dtype:
         return self._data.dtype
 
     def inc_state(self) -> None:
-        self._state[self.context] += 1
+        self._state[self.get_context()] += 1
 
     # NOTE: Why is this using _lazy_data instead of _data?
     # TODO: Either adjust for _lazy_data mapping or switch to _data
@@ -335,15 +335,10 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
         else:
             data = np.zeros_like(self._lazy_data)
         return self.__record_init__(_name=name, _lazy_data=data)
-    
-    def sync_to_host(self):
-        self.state[pyop3.HOST_DEVICE] = cp.asnumpy(self.state["gpu"])
-        self.state[pyop3.HOST_DEVICE] = self.state["gpu"]
 
     is_nested: ClassVar[bool] = False
     
-    @property
-    def context(self) -> str:
+    def get_context(self) -> str:
         return str(_current_device.get())
 
     @property
@@ -491,19 +486,40 @@ def assemble(self) -> None:
     def leaves_valid(self) -> bool:
         return self._leaves_valid
 
-    # TODO: Need logic to recognise when to copy data to GPU
     @property
     def _data(self):
-        context = self.context
-
-        if context == "cpu":
-            if self._lazy_data[context] is None:
-                self._lazy_data[context] = np.zeros(self.shape, dtype=self.dtype)
-        elif context == "gpu":
-            if self._lazy_data[context] is None:
-                self._lazy_data[context] = cp.zeros(self.shape, dtype=self.dtype)
-        else:
-            raise RuntimeError("Offload device not supported")
+        context = self.get_context()
+
+        # TODO: This logic cannot be kept
+        '''
+            Flaw in that host and gpu modifications will be out of sync.
+            This could lead to situations where multiple copies are made before GPU
+            context has reached the same modifications as CPU.
+            Even worse if data is called without being modified - copied but no state change.
+
+            Solution: 
+                - Copy state modifications after context window
+                - If states match, check if None, else copy.
+                - Do we want to free cupy space after context window? 
+        '''
+        if context == "gpu":
+            # If GPU has modifications >= CPU and not None, don't copy.
+            if self._lazy_data[context] is None or self.state[context] < self.state[pyop3.HOST_DEVICE]:
+                self._sync_to_device()
+
+        elif context == "cpu":
+            # If CPU has modifications >= GPU and not None, don't copy.
+            if self._lazy_data[context] is None or self.state[context] < self.state["gpu"]:
+                self._sync_to_host()
+
+        # if context == "cpu":
+        #     if self._lazy_data[context] is None:
+        #         self._lazy_data[context] = np.zeros(self.shape, dtype=self.dtype)
+        # elif context == "gpu":
+        #     if self._lazy_data[context] is None:
+        #         self._lazy_data[context] = cp.zeros(self.shape, dtype=self.dtype)
+        # else:
+        #     raise RuntimeError("Offload device not supported")
 
         if self.name == "array_247_buffer":
             breakpoint()
@@ -620,7 +636,15 @@ def localize(self) -> ArrayBuffer:
     @cached_property
     def _localized(self) -> ArrayBuffer:
         return self.__record_init__(sf=None)
+    
+    # TODO: Consider whether to make these asynchronous and group with a sync
+    def _sync_to_host(self):
+        self._lazy_data[pyop3.HOST_DEVICE] = cp.asnumpy(self._lazy_data["gpu"])
+        self.state[pyop3.HOST_DEVICE] = self.state["gpu"]
 
+    def _sync_to_device(self):
+        self._lazy_data["gpu"] = cp.asarray(self._lazy_data[pyop3.HOST_DEVICE])
+        self.state["gpu"] = self.state[pyop3.HOST_DEVICE]
 
 class MatBufferSpec(abc.ABC):
     pass
diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
index 6e735e7126..3a0df520de 100644
--- a/pyop3_gpu_demo.py
+++ b/pyop3_gpu_demo.py
@@ -55,9 +55,9 @@
 
     # state tracking checks
     assert f.dat.buffer.state[host] == 1  # modified once
-    assert f.dat.buffer.state[gpu] == 1  # matches host
+    assert f.dat.buffer.state[str(gpu)] == 1  # matches host
     assert g.dat.buffer.state[host] == 0  # untouched
-    assert g.dat.buffer.state[gpu] == 1  # modified once
+    assert g.dat.buffer.state[str(gpu)] == 1  # modified once
 
 assert isinstance(f.dat.data_ro, np.ndarray)
 assert isinstance(g.dat.data_ro, np.ndarray)
@@ -65,6 +65,6 @@
 
 # state tracking checks
 assert f.dat.buffer.state[host] == 1  # modified once
-assert f.dat.buffer.state[gpu] == 1  # matches host
+assert f.dat.buffer.state[str(gpu)] == 1  # matches host
 assert g.dat.buffer.state[host] == 1  # matches device
-assert g.dat.buffer.state[gpu] == 1  # modified once
+assert g.dat.buffer.state[str(gpu)] == 1  # modified once

From b044c7afe834f3036a059bc9d27ad1cfd20eb3b6 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Wed, 29 Apr 2026 19:18:23 +0100
Subject: [PATCH 08/36] tofix: revised approach ensuring explicit choice of GPU
 device

---
 pyop3/__init__.py |  1 -
 pyop3/buffer.py   | 56 +++++++++++++++++++------------------------
 pyop3/device.py   | 60 +++++++++++++++++++++++++++++++++--------------
 pyop3_gpu_demo.py |  3 +--
 4 files changed, 68 insertions(+), 52 deletions(-)

diff --git a/pyop3/__init__.py b/pyop3/__init__.py
index e5590ac2d1..7ce9203b0a 100644
--- a/pyop3/__init__.py
+++ b/pyop3/__init__.py
@@ -90,7 +90,6 @@ def _init_likwid():
 from pyop3.device import ( # noqa: F401
     HOST_DEVICE,
     GPU,
-    offloading
 )
 from pyop3.sf import StarForest, single_star_sf, local_sf
 import pyop3.sf
diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 52f75c3f57..974eef9a6f 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -23,7 +23,7 @@
 from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly
 from pyop3.device import (
     HOST_DEVICE,
-    _current_device
+    Device
 )
 
 from ._buffer_cy import set_petsc_mat_diagonal
@@ -322,7 +322,7 @@ def dtype(self) -> np.dtype:
         return self._data.dtype
 
     def inc_state(self) -> None:
-        self._state[self.get_context()] += 1
+        self._state[self.get_context_string()] += 1
 
     # NOTE: Why is this using _lazy_data instead of _data?
     # TODO: Either adjust for _lazy_data mapping or switch to _data
@@ -338,8 +338,11 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
 
     is_nested: ClassVar[bool] = False
     
-    def get_context(self) -> str:
-        return str(_current_device.get())
+    def get_context(self) -> Device:
+        return Device.current()
+
+    def get_context_string(self) -> str:
+        return str(Device.current())
 
     @property
     def handle(self) -> np.ndarray:
@@ -489,41 +492,25 @@ def leaves_valid(self) -> bool:
     @property
     def _data(self):
         context = self.get_context()
+        context_str = self.get_context_string()
 
-        # TODO: This logic cannot be kept
-        '''
-            Flaw in that host and gpu modifications will be out of sync.
-            This could lead to situations where multiple copies are made before GPU
-            context has reached the same modifications as CPU.
-            Even worse if data is called without being modified - copied but no state change.
-
-            Solution: 
-                - Copy state modifications after context window
-                - If states match, check if None, else copy.
-                - Do we want to free cupy space after context window? 
-        '''
-        if context == "gpu":
-            # If GPU has modifications >= CPU and not None, don't copy.
-            if self._lazy_data[context] is None or self.state[context] < self.state[pyop3.HOST_DEVICE]:
+        if context_str == "gpu":
+            if self._lazy_data[context_str] is None or self.state[context_str] < self.state[pyop3.HOST_DEVICE]:
                 self._sync_to_device()
-
-        elif context == "cpu":
-            # If CPU has modifications >= GPU and not None, don't copy.
-            if self._lazy_data[context] is None or self.state[context] < self.state["gpu"]:
-                self._sync_to_host()
-
-        # if context == "cpu":
-        #     if self._lazy_data[context] is None:
-        #         self._lazy_data[context] = np.zeros(self.shape, dtype=self.dtype)
-        # elif context == "gpu":
-        #     if self._lazy_data[context] is None:
-        #         self._lazy_data[context] = cp.zeros(self.shape, dtype=self.dtype)
+                context.register(self)
+
+        # if context_str == "cpu":
+        #     if self._lazy_data[context_str] is None:
+        #         self._lazy_data[context_str] = np.zeros(self.shape, dtype=self.dtype)
+        # elif context_str == "gpu":
+        #     if self._lazy_data[context_str] is None:
+        #         self._lazy_data[context_str] = cp.zeros(self.shape, dtype=self.dtype)
         # else:
         #     raise RuntimeError("Offload device not supported")
 
         if self.name == "array_247_buffer":
             breakpoint()
-        return self._lazy_data[context]
+        return self._lazy_data[context_str]
 
     # TODO: I think the halo bits should only be handled at the Dat level via the
     # axis tree. Here we can just consider the array.
@@ -638,6 +625,11 @@ def _localized(self) -> ArrayBuffer:
         return self.__record_init__(sf=None)
     
     # TODO: Consider whether to make these asynchronous and group with a sync
+    def maybe_sync_to_host(self):
+        # Only sync if modified whilst on GPU
+        self.state[pyop3.HOST_DEVICE] < self.state["gpu"]:
+            self._sync_to_host()
+
     def _sync_to_host(self):
         self._lazy_data[pyop3.HOST_DEVICE] = cp.asnumpy(self._lazy_data["gpu"])
         self.state[pyop3.HOST_DEVICE] = self.state["gpu"]
diff --git a/pyop3/device.py b/pyop3/device.py
index 2bbc0c0c83..b94ebf812d 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -4,20 +4,29 @@
 import contextvars
 import warnings
 
+import cupy as cp
+
+
 # TODO: The constant should be elsewhere but temporary here 
 HOST_DEVICE = "cpu"
 
-# NOTE: Use contextvars to act as a bridge between buffer and manager classes
+# NOTE: Use contextvars to act as a bridge between buffer and manager class
 _current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE)
 
 class Device(metaclass=ABCMeta):
-    _available: bool
     _name: str
-    _accessed_buffer: list
+    _registered_arrays: set 
 
-    def __init__(self):
-        self._available = True
-        self._accessed_buffer = []
+    def __init__(self, device: int | None = None):
+        pass
+
+    @staticmethod
+    def current():
+        device = _current_device.get()
+        return device
+
+    @abstractmethod
+    def register(self, arr):
         pass
 
     @abstractmethod
@@ -28,12 +37,38 @@ def __repr__(self):
     def __str__(self):
         pass
 
+# TODO: Necessary to make this?
+class CPU(Device):
+    pass
+
+# Implementation follows similar idea to CuPY (with GPU(): ...)
 class GPU(Device):
     
-    def __init__(self):
+    def __init__(self, device: int | None = None):
         super().__init__()
         self._name = "gpu"
-    pass
+        self._registered_arrays = set()
+        self._token = None
+        self.device = cp.cuda.Device(device)
+
+    def __enter__(self):
+        self._token = _current_device.set(self)
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.sync_buffers()
+        _current_device.reset(self._token)
+        self._reset_register()
+
+    def sync_buffers(self):
+        for arr in self._registered_arrays:
+            arr.maybe_sync_to_host()
+
+    def register(self, arr):
+        self._registered_arrays.add(arr)
+
+    def _reset_register(self):
+        self._registered_arrays = set()
 
     def __repr__(self):
         return self._name
@@ -41,12 +76,3 @@ def __repr__(self):
     def __str__(self):
         return self._name
 
-@contextlib.contextmanager
-def offloading(dev: Device): 
-    token = _current_device.set(dev)
-    try:
-        yield
-    finally:
-        _current_device.reset(token)
-        # TODO: Update buffers and copy back
-        # NOTE: Does this have to happen lazily if we do not have a record of accessed buffers?
diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
index 3a0df520de..4b830656f7 100644
--- a/pyop3_gpu_demo.py
+++ b/pyop3_gpu_demo.py
@@ -41,9 +41,8 @@
 assert g.dat.buffer.state[host] == 0  # untouched
 assert g.dat.buffer.state[str(gpu)] == 0  # untouched
 
-with op3.offloading(gpu):
+with gpu:
     # Getting the .data attribute on the GPU should give us back a GPU array type
-
     assert not isinstance(f.dat.data_ro, np.ndarray)
     assert not isinstance(g.dat.data_ro, np.ndarray)
 

From 1de6320b2fc98eba58d4eecc63ecdd02d9198ef2 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Thu, 30 Apr 2026 10:54:20 +0100
Subject: [PATCH 09/36] implicit transfer and defaultdict implementation
 (pub/sub eager copy model)

---
 pyop3/__init__.py |  1 +
 pyop3/buffer.py   | 81 +++++++++++++++++++++++++++--------------------
 pyop3/device.py   | 75 +++++++++++++++++++++++++++----------------
 pyop3_gpu_demo.py | 14 ++++----
 4 files changed, 102 insertions(+), 69 deletions(-)

diff --git a/pyop3/__init__.py b/pyop3/__init__.py
index 7ce9203b0a..e5590ac2d1 100644
--- a/pyop3/__init__.py
+++ b/pyop3/__init__.py
@@ -90,6 +90,7 @@ def _init_likwid():
 from pyop3.device import ( # noqa: F401
     HOST_DEVICE,
     GPU,
+    offloading
 )
 from pyop3.sf import StarForest, single_star_sf, local_sf
 import pyop3.sf
diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 974eef9a6f..88f3d7c6ed 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -22,8 +22,8 @@
 from pyop3.sf import DistributedObject, NullStarForest, StarForest, local_sf
 from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly
 from pyop3.device import (
-    HOST_DEVICE,
-    Device
+    Device,
+    HOST_DEVICE
 )
 
 from ._buffer_cy import set_petsc_mat_diagonal
@@ -239,7 +239,7 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer):
     _rank_equal: bool
     _ordered: bool
     # TODO: Mutable pair of variables corresponding to state for host & device  
-    _state: dict[str, int]
+    _state: dict[Device, int]
 
     _max_value: np.number | None = None
 
@@ -256,7 +256,16 @@ def instruction_executor_cache_key(self, buffer_counter: Mapping[AbstractBuffer,
 
     def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False):
 
-        data_mapping = {pyop3.HOST_DEVICE: None, "gpu": None}
+        data_mapping = {}
+        if isinstance(data, np.ndarray):
+            data_mapping[pyop3.HOST_DEVICE] = data
+        elif isinstance(data, cp.ndarray):
+            ctx = self.get_context()
+            # TODO: Raise custom exception if no Device context 
+            if not isinstance(ctx, Device):
+                raise NotImplementedError
+
+            data_mapping[ctx] = data 
 
         if sf is None:
             sf = NullStarForest(data.size)
@@ -270,11 +279,6 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
         # TODO: CuPy has no support for `writeable` flag 
         if constant:
             data.flags.writeable = False
-
-        if isinstance(data, np.ndarray):
-            data_mapping[pyop3.HOST_DEVICE] = data
-        elif isinstance(data, cp.ndarray):
-            data_mapping["gpu"] = data 
             
         self._lazy_data = data_mapping 
         self.sf = sf
@@ -283,8 +287,10 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
         self._rank_equal = rank_equal
         self._max_value = max_value
         self._ordered = ordered
-        # TODO: Quite ugly, maybe switch to ENUM in future.
-        self._state = {pyop3.HOST_DEVICE: 0, "gpu": 0}
+
+        # NOTE: Defaultdict approach, worth discussing with Connor 
+        # TODO: Not acknowledging if created whilst in device context
+        self._state = collections.defaultdict(int, [(pyop3.HOST_DEVICE, 0)]) 
         self.__post_init__()
 
     def __post_init__(self) -> None:
@@ -322,7 +328,7 @@ def dtype(self) -> np.dtype:
         return self._data.dtype
 
     def inc_state(self) -> None:
-        self._state[self.get_context_string()] += 1
+        self._state[self.get_context()] += 1
 
     # NOTE: Why is this using _lazy_data instead of _data?
     # TODO: Either adjust for _lazy_data mapping or switch to _data
@@ -341,9 +347,6 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
     def get_context(self) -> Device:
         return Device.current()
 
-    def get_context_string(self) -> str:
-        return str(Device.current())
-
     @property
     def handle(self) -> np.ndarray:
         return self._data
@@ -491,26 +494,24 @@ def leaves_valid(self) -> bool:
 
     @property
     def _data(self):
-        context = self.get_context()
-        context_str = self.get_context_string()
+        ctx = self.get_context()
 
-        if context_str == "gpu":
-            if self._lazy_data[context_str] is None or self.state[context_str] < self.state[pyop3.HOST_DEVICE]:
-                self._sync_to_device()
-                context.register(self)
+        # NOTE: Consequence of implicit transfers
+        if ctx not in self._lazy_data:
+            self._lazy_data[ctx] = None    
 
-        # if context_str == "cpu":
-        #     if self._lazy_data[context_str] is None:
-        #         self._lazy_data[context_str] = np.zeros(self.shape, dtype=self.dtype)
-        # elif context_str == "gpu":
-        #     if self._lazy_data[context_str] is None:
-        #         self._lazy_data[context_str] = cp.zeros(self.shape, dtype=self.dtype)
-        # else:
-        #     raise RuntimeError("Offload device not supported")
+        # TODO: Don't like explicit GPU reference
+        if ctx.name == "gpu":
+            if self._lazy_data[ctx] is None or self.state[ctx] < self.state[pyop3.HOST_DEVICE]:
+                self._sync_to_device()
+                ctx.register(self)
+        
+        # if self._lazy_data[ctx_str] is None:
+        #    self._lazy_data[ctx_str] = np.zeros(self.shape, dtype=self.dtype)
 
         if self.name == "array_247_buffer":
             breakpoint()
-        return self._lazy_data[context_str]
+        return self._lazy_data[ctx]
 
     # TODO: I think the halo bits should only be handled at the Dat level via the
     # axis tree. Here we can just consider the array.
@@ -627,16 +628,26 @@ def _localized(self) -> ArrayBuffer:
     # TODO: Consider whether to make these asynchronous and group with a sync
     def maybe_sync_to_host(self):
         # Only sync if modified whilst on GPU
-        self.state[pyop3.HOST_DEVICE] < self.state["gpu"]:
+        if self.state[pyop3.HOST_DEVICE] < self.state[self.get_context()]:
             self._sync_to_host()
 
     def _sync_to_host(self):
-        self._lazy_data[pyop3.HOST_DEVICE] = cp.asnumpy(self._lazy_data["gpu"])
-        self.state[pyop3.HOST_DEVICE] = self.state["gpu"]
+        context = self.get_context()
+        # TODO: Raise exception (warning?) as user on host already
+        if context is pyop3.HOST_DEVICE:
+            raise NotImplementedError
+
+        self._lazy_data[pyop3.HOST_DEVICE] = cp.asnumpy(self._lazy_data[context])
+        self.state[pyop3.HOST_DEVICE] = self.state[context]
 
     def _sync_to_device(self):
-        self._lazy_data["gpu"] = cp.asarray(self._lazy_data[pyop3.HOST_DEVICE])
-        self.state["gpu"] = self.state[pyop3.HOST_DEVICE]
+        context = self.get_context()
+        # TODO: Raise exception (warning?) as user on host already
+        if context is pyop3.HOST_DEVICE:
+            raise NotImplementedError
+
+        self._lazy_data[context] = cp.asarray(self._lazy_data[pyop3.HOST_DEVICE])
+        self.state[context] = self.state[pyop3.HOST_DEVICE]
 
 class MatBufferSpec(abc.ABC):
     pass
diff --git a/pyop3/device.py b/pyop3/device.py
index b94ebf812d..f717cea7ad 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -6,13 +6,6 @@
 
 import cupy as cp
 
-
-# TODO: The constant should be elsewhere but temporary here 
-HOST_DEVICE = "cpu"
-
-# NOTE: Use contextvars to act as a bridge between buffer and manager class
-_current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE)
-
 class Device(metaclass=ABCMeta):
     _name: str
     _registered_arrays: set 
@@ -24,24 +17,48 @@ def __init__(self, device: int | None = None):
     def current():
         device = _current_device.get()
         return device
+    
+    @abstractmethod
+    def sync_buffers(self):
+        pass
 
     @abstractmethod
     def register(self, arr):
         pass
 
     @abstractmethod
-    def __repr__(self):
-        pass 
+    def _reset_register(self):
+        pass
 
-    @abstractmethod
+    @property
+    def name(self):
+        return self._name
+
+    def __repr__(self):
+        return self._name
+        
     def __str__(self):
-        pass
+        return self._name
 
-# TODO: Necessary to make this?
 class CPU(Device):
-    pass
 
-# Implementation follows similar idea to CuPY (with GPU(): ...)
+    def __init__(self, device: int | None = None):
+        super().__init__()
+        self._name = "cpu"
+        self._registered_arrays = set()
+        self.device = None
+
+    # NOTE: Is it necessary to have any implementation here? 
+    # Maybe perfunctory implementations but no real purpose
+    def sync_buffers(self):
+        pass
+
+    def register(self, arr):
+        self._registered_arrays.add(arr)
+
+    def _reset_register(self):
+        self._registered_arrays = set()
+
 class GPU(Device):
     
     def __init__(self, device: int | None = None):
@@ -51,15 +68,6 @@ def __init__(self, device: int | None = None):
         self._token = None
         self.device = cp.cuda.Device(device)
 
-    def __enter__(self):
-        self._token = _current_device.set(self)
-        return self
-
-    def __exit__(self, type, value, traceback):
-        self.sync_buffers()
-        _current_device.reset(self._token)
-        self._reset_register()
-
     def sync_buffers(self):
         for arr in self._registered_arrays:
             arr.maybe_sync_to_host()
@@ -70,9 +78,22 @@ def register(self, arr):
     def _reset_register(self):
         self._registered_arrays = set()
 
-    def __repr__(self):
-        return self._name
+@contextlib.contextmanager
+def offloading(device: Device):
+    # TODO: Not device exception
+    if not isinstance(device, Device):
+        raise NotImplementedError
 
-    def __str__(self):
-        return self._name
+    token = _current_device.set(device)
+    try:
+        yield
+    finally:
+        device.sync_buffers()
+        device._reset_register()
+        _current_device.reset(token)
+
+# TODO: Should this const variable be here? 
+HOST_DEVICE = CPU() 
 
+# NOTE: Use contextvars to act as a bridge between buffer and manager class
+_current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE)
diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
index 4b830656f7..6802e5fea6 100644
--- a/pyop3_gpu_demo.py
+++ b/pyop3_gpu_demo.py
@@ -37,11 +37,11 @@
 
 # state tracking checks, .buffer.state is now device-specific
 assert f.dat.buffer.state[host] == 1  # modified once
-assert f.dat.buffer.state[str(gpu)] == 0  # untouched
+assert f.dat.buffer.state[gpu] == 0  # untouched
 assert g.dat.buffer.state[host] == 0  # untouched
-assert g.dat.buffer.state[str(gpu)] == 0  # untouched
+assert g.dat.buffer.state[gpu] == 0  # untouched
 
-with gpu:
+with op3.offloading(gpu):
     # Getting the .data attribute on the GPU should give us back a GPU array type
     assert not isinstance(f.dat.data_ro, np.ndarray)
     assert not isinstance(g.dat.data_ro, np.ndarray)
@@ -54,9 +54,9 @@
 
     # state tracking checks
     assert f.dat.buffer.state[host] == 1  # modified once
-    assert f.dat.buffer.state[str(gpu)] == 1  # matches host
+    assert f.dat.buffer.state[gpu] == 1  # matches host
     assert g.dat.buffer.state[host] == 0  # untouched
-    assert g.dat.buffer.state[str(gpu)] == 1  # modified once
+    assert g.dat.buffer.state[gpu] == 1  # modified once
 
 assert isinstance(f.dat.data_ro, np.ndarray)
 assert isinstance(g.dat.data_ro, np.ndarray)
@@ -64,6 +64,6 @@
 
 # state tracking checks
 assert f.dat.buffer.state[host] == 1  # modified once
-assert f.dat.buffer.state[str(gpu)] == 1  # matches host
+assert f.dat.buffer.state[gpu] == 1  # matches host
 assert g.dat.buffer.state[host] == 1  # matches device
-assert g.dat.buffer.state[str(gpu)] == 1  # modified once
+assert g.dat.buffer.state[gpu] == 1  # modified once

From 6a263349f137bf6793b773e9846020f80301b341 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Thu, 30 Apr 2026 12:02:58 +0100
Subject: [PATCH 10/36] explicit check if GPU available on init

---
 pyop3/buffer.py |  5 +++--
 pyop3/device.py | 36 +++++++++++++++++++++++-------------
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 88f3d7c6ed..ac984790de 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -626,8 +626,9 @@ def _localized(self) -> ArrayBuffer:
         return self.__record_init__(sf=None)
     
     # TODO: Consider whether to make these asynchronous and group with a sync
+    # NOTE: Perhaps it is easier to allow user to make it non/blocking
     def maybe_sync_to_host(self):
-        # Only sync if modified whilst on GPU
+        # NOTE: Ugly but won't throw error if context is host 
         if self.state[pyop3.HOST_DEVICE] < self.state[self.get_context()]:
             self._sync_to_host()
 
@@ -642,7 +643,7 @@ def _sync_to_host(self):
 
     def _sync_to_device(self):
         context = self.get_context()
-        # TODO: Raise exception (warning?) as user on host already
+        # TODO: Raise exception as user still on host
         if context is pyop3.HOST_DEVICE:
             raise NotImplementedError
 
diff --git a/pyop3/device.py b/pyop3/device.py
index f717cea7ad..6823d96c0f 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -4,13 +4,12 @@
 import contextvars
 import warnings
 
-import cupy as cp
-
 class Device(metaclass=ABCMeta):
     _name: str
     _registered_arrays: set 
+    _device_index: int | None
 
-    def __init__(self, device: int | None = None):
+    def __init__(self, device_index: int | None = None):
         pass
 
     @staticmethod
@@ -34,6 +33,10 @@ def _reset_register(self):
     def name(self):
         return self._name
 
+    @property
+    def device_index(self):
+        return self._device_index
+
     def __repr__(self):
         return self._name
         
@@ -42,11 +45,11 @@ def __str__(self):
 
 class CPU(Device):
 
-    def __init__(self, device: int | None = None):
+    def __init__(self, device_index: int | None = None):
         super().__init__()
         self._name = "cpu"
         self._registered_arrays = set()
-        self.device = None
+        self._device_index = device_index
 
     # NOTE: Is it necessary to have any implementation here? 
     # Maybe perfunctory implementations but no real purpose
@@ -61,12 +64,19 @@ def _reset_register(self):
 
 class GPU(Device):
     
-    def __init__(self, device: int | None = None):
+    def __init__(self, device_index: int | None = None):
         super().__init__()
         self._name = "gpu"
         self._registered_arrays = set()
         self._token = None
-        self.device = cp.cuda.Device(device)
+        self._device_index = device_index
+
+        try:
+            import cupy as cp
+            assert cp.is_available()
+        except:
+            # Raise No GPU exception
+            raise NotImplementedError 
 
     def sync_buffers(self):
         for arr in self._registered_arrays:
@@ -79,17 +89,17 @@ def _reset_register(self):
         self._registered_arrays = set()
 
 @contextlib.contextmanager
-def offloading(device: Device):
-    # TODO: Not device exception
-    if not isinstance(device, Device):
+def offloading(dev: Device):
+    # TODO: Not Device exception
+    if not isinstance(dev, Device):
         raise NotImplementedError
 
-    token = _current_device.set(device)
+    token = _current_device.set(dev)
     try:
         yield
     finally:
-        device.sync_buffers()
-        device._reset_register()
+        dev.sync_buffers()
+        dev._reset_register()
         _current_device.reset(token)
 
 # TODO: Should this const variable be here? 

From 8d1f967c49b69ed44a599b981689e87eb388be82 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Thu, 30 Apr 2026 16:38:35 +0100
Subject: [PATCH 11/36] cudagpu and fix incoming re: remove eager
 copying/register & dev syncing

---
 pyop3/buffer.py | 6 +++---
 pyop3/device.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index ac984790de..c83cc59459 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -232,14 +232,14 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer):
     # {{{ Instance attrs
 
     # TODO: Update to lazily allocated pair of host/device arrays
-    _lazy_data: dict[str, np.ndarray | cp.ndarray] = dataclasses.field(repr=False)
+    _lazy_data: dict[Device, np.ndarray | cp.ndarray] = dataclasses.field(repr=False)
     sf: StarForest
     _name: str
     _constant: bool
     _rank_equal: bool
     _ordered: bool
-    # TODO: Mutable pair of variables corresponding to state for host & device  
-    _state: dict[Device, int]
+    # TODO: Should this be a defaultdict?
+    _state: collections.defaultdict[Device, int]
 
     _max_value: np.number | None = None
 
diff --git a/pyop3/device.py b/pyop3/device.py
index 6823d96c0f..f12375ba21 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -75,7 +75,7 @@ def __init__(self, device_index: int | None = None):
             import cupy as cp
             assert cp.is_available()
         except:
-            # Raise No GPU exception
+            # TODO: Raise No GPU exception
             raise NotImplementedError 
 
     def sync_buffers(self):

From 9a729e916686f72fcbc5a89fd85656a070feb062 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Fri, 1 May 2026 11:03:13 +0100
Subject: [PATCH 12/36] move conversion logic to device.py

---
 pyop3/__init__.py |  2 +-
 pyop3/buffer.py   | 72 +++++++++++++++++++++++------------------------
 pyop3/device.py   | 58 ++++++++++++++------------------------
 pyop3_gpu_demo.py |  4 ++-
 4 files changed, 61 insertions(+), 75 deletions(-)

diff --git a/pyop3/__init__.py b/pyop3/__init__.py
index e5590ac2d1..e53a7360e7 100644
--- a/pyop3/__init__.py
+++ b/pyop3/__init__.py
@@ -89,7 +89,7 @@ def _init_likwid():
 )
 from pyop3.device import ( # noqa: F401
     HOST_DEVICE,
-    GPU,
+    CUDAGPU,
     offloading
 )
 from pyop3.sf import StarForest, single_star_sf, local_sf
diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index c83cc59459..051fcfe66f 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -23,6 +23,8 @@
 from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly
 from pyop3.device import (
     Device,
+    CUDAGPU,
+    CPU,
     HOST_DEVICE
 )
 
@@ -240,6 +242,7 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer):
     _ordered: bool
     # TODO: Should this be a defaultdict?
     _state: collections.defaultdict[Device, int]
+    _last_updated_device: Device
 
     _max_value: np.number | None = None
 
@@ -287,10 +290,11 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
         self._rank_equal = rank_equal
         self._max_value = max_value
         self._ordered = ordered
+        self._last_updated_device = self.get_context()
 
-        # NOTE: Defaultdict approach, worth discussing with Connor 
+        # NOTE: Connor and I don't like defaultdict, unsure how to change it atm 
         # TODO: Not acknowledging if created whilst in device context
-        self._state = collections.defaultdict(int, [(pyop3.HOST_DEVICE, 0)]) 
+        self._state = collections.defaultdict(int, [(self.get_context(), 0)]) 
         self.__post_init__()
 
     def __post_init__(self) -> None:
@@ -328,7 +332,9 @@ def dtype(self) -> np.dtype:
         return self._data.dtype
 
     def inc_state(self) -> None:
-        self._state[self.get_context()] += 1
+        ctx = self.get_context()
+        self._state[ctx] += 1
+        self._last_updated_device = ctx
 
     # NOTE: Why is this using _lazy_data instead of _data?
     # TODO: Either adjust for _lazy_data mapping or switch to _data
@@ -496,18 +502,12 @@ def leaves_valid(self) -> bool:
     def _data(self):
         ctx = self.get_context()
 
-        # NOTE: Consequence of implicit transfers
-        if ctx not in self._lazy_data:
-            self._lazy_data[ctx] = None    
-
-        # TODO: Don't like explicit GPU reference
-        if ctx.name == "gpu":
-            if self._lazy_data[ctx] is None or self.state[ctx] < self.state[pyop3.HOST_DEVICE]:
-                self._sync_to_device()
-                ctx.register(self)
+        if not self._is_data_available(ctx) or not self._is_data_synced(ctx):
+            self.sync_devices(ctx)
         
-        # if self._lazy_data[ctx_str] is None:
-        #    self._lazy_data[ctx_str] = np.zeros(self.shape, dtype=self.dtype)
+        # NOTE: If data is None, set to zeros? 
+        # if self._lazy_data is None:
+        #    self._lazy_data = np.zeros(self.shape, dtype=self.dtype)
 
         if self.name == "array_247_buffer":
             breakpoint()
@@ -627,28 +627,28 @@ def _localized(self) -> ArrayBuffer:
     
     # TODO: Consider whether to make these asynchronous and group with a sync
     # NOTE: Perhaps it is easier to allow user to make it non/blocking
-    def maybe_sync_to_host(self):
-        # NOTE: Ugly but won't throw error if context is host 
-        if self.state[pyop3.HOST_DEVICE] < self.state[self.get_context()]:
-            self._sync_to_host()
-
-    def _sync_to_host(self):
-        context = self.get_context()
-        # TODO: Raise exception (warning?) as user on host already
-        if context is pyop3.HOST_DEVICE:
-            raise NotImplementedError
-
-        self._lazy_data[pyop3.HOST_DEVICE] = cp.asnumpy(self._lazy_data[context])
-        self.state[pyop3.HOST_DEVICE] = self.state[context]
-
-    def _sync_to_device(self):
-        context = self.get_context()
-        # TODO: Raise exception as user still on host
-        if context is pyop3.HOST_DEVICE:
-            raise NotImplementedError
-
-        self._lazy_data[context] = cp.asarray(self._lazy_data[pyop3.HOST_DEVICE])
-        self.state[context] = self.state[pyop3.HOST_DEVICE]
+    def sync_devices(self, current_device: Device):
+        last_updated_device = self._last_updated_device
+
+        self._lazy_data[current_device] = current_device.asarray(self._lazy_data[last_updated_device])
+
+        # if isinstance(current_device, CUDAGPU):
+        #     self._lazy_data[current_device] = cp.asarray(self._lazy_data[last_updated_device])
+        # elif isinstance(current_device, CPU):
+        #     if isinstance(last_updated_device, CUDAGPU):
+        #         self._lazy_data[current_device] = cp.asnumpy(self._lazy_data[last_updated_device])
+        #     elif isinstance(last_updated_device, CPU):
+        #         self._lazy_data[current_device] = np.array(self._lazy_data[last_updated_device])
+        # else:
+        #     raise NotImplementedError 
+        
+        self._state[current_device] = self._state[last_updated_device]
+
+    def _is_data_available(self, device: Device):
+        return device in self._lazy_data
+
+    def _is_data_synced(self, device: Device):
+        return self.state[device] == max(self.state.values())
 
 class MatBufferSpec(abc.ABC):
     pass
diff --git a/pyop3/device.py b/pyop3/device.py
index f12375ba21..eadc74aaaf 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -4,30 +4,21 @@
 import contextvars
 import warnings
 
+import numpy as np
+
 class Device(metaclass=ABCMeta):
     _name: str
-    _registered_arrays: set 
     _device_index: int | None
 
     def __init__(self, device_index: int | None = None):
         pass
 
+    # TODO: Method should not be associated with object 
+    # NOTE: Would be weird to get current device as CPU whilst requesting on GPU
     @staticmethod
     def current():
         device = _current_device.get()
         return device
-    
-    @abstractmethod
-    def sync_buffers(self):
-        pass
-
-    @abstractmethod
-    def register(self, arr):
-        pass
-
-    @abstractmethod
-    def _reset_register(self):
-        pass
 
     @property
     def name(self):
@@ -37,6 +28,10 @@ def name(self):
     def device_index(self):
         return self._device_index
 
+    @abstractmethod
+    def asarray(self, arr):
+        pass
+
     def __repr__(self):
         return self._name
         
@@ -51,42 +46,33 @@ def __init__(self, device_index: int | None = None):
         self._registered_arrays = set()
         self._device_index = device_index
 
-    # NOTE: Is it necessary to have any implementation here? 
-    # Maybe perfunctory implementations but no real purpose
-    def sync_buffers(self):
-        pass
-
-    def register(self, arr):
-        self._registered_arrays.add(arr)
-
-    def _reset_register(self):
-        self._registered_arrays = set()
+    def asarray(self, arr):
+        # NOTE: Better logic if we switch from just NumPy/CuPy
+        if not isinstance(arr, np.ndarray):
+            import cupy as cp
+            return cp.asnumpy(arr)
+    
+        return np.array(arr)    
 
-class GPU(Device):
+class CUDAGPU(Device):
     
     def __init__(self, device_index: int | None = None):
         super().__init__()
-        self._name = "gpu"
+        self._name = "CudaGPU"
         self._registered_arrays = set()
         self._token = None
         self._device_index = device_index
 
         try:
             import cupy as cp
-            assert cp.is_available()
+            assert cp.is_available() 
         except:
             # TODO: Raise No GPU exception
             raise NotImplementedError 
 
-    def sync_buffers(self):
-        for arr in self._registered_arrays:
-            arr.maybe_sync_to_host()
-
-    def register(self, arr):
-        self._registered_arrays.add(arr)
-
-    def _reset_register(self):
-        self._registered_arrays = set()
+    def asarray(self, arr):
+        import cupy as cp
+        return cp.asarray(arr)
 
 @contextlib.contextmanager
 def offloading(dev: Device):
@@ -98,8 +84,6 @@ def offloading(dev: Device):
     try:
         yield
     finally:
-        dev.sync_buffers()
-        dev._reset_register()
         _current_device.reset(token)
 
 # TODO: Should this const variable be here? 
diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
index 6802e5fea6..b124ba1c21 100644
--- a/pyop3_gpu_demo.py
+++ b/pyop3_gpu_demo.py
@@ -24,7 +24,7 @@
 
 # made up API, we need some way to identify the device
 host = op3.HOST_DEVICE  # or similar
-gpu = op3.GPU()
+gpu = op3.CUDAGPU()
 
 mesh = UnitSquareMesh(3, 3)
 V = FunctionSpace(mesh, "P", 2)
@@ -58,6 +58,8 @@
     assert g.dat.buffer.state[host] == 0  # untouched
     assert g.dat.buffer.state[gpu] == 1  # modified once
 
+# print(f"{g.dat.buffer._lazy_data=}")
+# print(f"{f.dat.buffer._lazy_data=}")
 assert isinstance(f.dat.data_ro, np.ndarray)
 assert isinstance(g.dat.data_ro, np.ndarray)
 assert (g.dat.data_ro == 23).all()

From 21bac65b4f1b31efeb27552705ff5a0190c7bed8 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Fri, 1 May 2026 12:59:29 +0100
Subject: [PATCH 13/36] managing buffer duplicate

---
 pyop3/buffer.py   | 61 +++++++++++++++++------------------------------
 pyop3/device.py   | 24 +++++++++++--------
 pyop3/utils.py    | 16 ++++++++++---
 pyop3_gpu_demo.py |  3 +++
 4 files changed, 52 insertions(+), 52 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 051fcfe66f..6190689c2e 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -11,7 +11,6 @@
 from typing import Any, ClassVar, Hashable
 
 import numpy as np
-import cupy as cp
 from mpi4py import MPI
 from petsc4py import PETSc
 
@@ -25,7 +24,8 @@
     Device,
     CUDAGPU,
     CPU,
-    HOST_DEVICE
+    HOST_DEVICE,
+    _current_device
 )
 
 from ._buffer_cy import set_petsc_mat_diagonal
@@ -259,17 +259,6 @@ def instruction_executor_cache_key(self, buffer_counter: Mapping[AbstractBuffer,
 
     def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False):
 
-        data_mapping = {}
-        if isinstance(data, np.ndarray):
-            data_mapping[pyop3.HOST_DEVICE] = data
-        elif isinstance(data, cp.ndarray):
-            ctx = self.get_context()
-            # TODO: Raise custom exception if no Device context 
-            if not isinstance(ctx, Device):
-                raise NotImplementedError
-
-            data_mapping[ctx] = data 
-
         if sf is None:
             sf = NullStarForest(data.size)
         name = utils.maybe_generate_name(name, prefix, self.DEFAULT_PREFIX)
@@ -279,10 +268,10 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
         if rank_equal and not constant:
             raise ValueError
 
-        # TODO: CuPy has no support for `writeable` flag 
-        if constant:
-            data.flags.writeable = False
-            
+        ctx = self.get_context()
+        data_mapping = {}
+        data_mapping[ctx] = ctx.asarray(data)
+
         self._lazy_data = data_mapping 
         self.sf = sf
         self._name = name
@@ -293,8 +282,12 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
         self._last_updated_device = self.get_context()
 
         # NOTE: Connor and I don't like defaultdict, unsure how to change it atm 
-        # TODO: Not acknowledging if created whilst in device context
-        self._state = collections.defaultdict(int, [(self.get_context(), 0)]) 
+        self._state = collections.defaultdict(int, [(ctx, 0)]) 
+
+        # TODO: CuPy has no support for `writeable` flag 
+        if constant and isinstance(self._data, np.ndarray):
+            self._data.flags.writeable = False
+
         self.__post_init__()
 
     def __post_init__(self) -> None:
@@ -303,7 +296,7 @@ def __post_init__(self) -> None:
             assert self.constant
         if self.ordered:
             utils.debug_assert(lambda: utils.is_sorted(self._lazy_data))
-        if self.constant:
+        if self.constant and isinstance(self._data, np.ndarray):
             assert not self._data.flags.writeable
 
     # }}}
@@ -336,22 +329,21 @@ def inc_state(self) -> None:
         self._state[ctx] += 1
         self._last_updated_device = ctx
 
-    # NOTE: Why is this using _lazy_data instead of _data?
     # TODO: Either adjust for _lazy_data mapping or switch to _data
     def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
         # make sure that there are no pending transfers before we copy
         self.assemble()
         name = f"{self.name}_copy"
         if copy:
-            data = self._lazy_data.copy()
+            data = {obj: arr.copy() for obj, arr in self._lazy_data.items()}
         else:
-            data = np.zeros_like(self._lazy_data)
+            data = {obj: obj.zeros_like(arr) for obj, arr in self._lazy_data.items()}
         return self.__record_init__(_name=name, _lazy_data=data)
 
     is_nested: ClassVar[bool] = False
     
     def get_context(self) -> Device:
-        return Device.current()
+        return _current_device.get()
 
     @property
     def handle(self) -> np.ndarray:
@@ -625,29 +617,20 @@ def localize(self) -> ArrayBuffer:
     def _localized(self) -> ArrayBuffer:
         return self.__record_init__(sf=None)
     
-    # TODO: Consider whether to make these asynchronous and group with a sync
-    # NOTE: Perhaps it is easier to allow user to make it non/blocking
     def sync_devices(self, current_device: Device):
         last_updated_device = self._last_updated_device
 
         self._lazy_data[current_device] = current_device.asarray(self._lazy_data[last_updated_device])
-
-        # if isinstance(current_device, CUDAGPU):
-        #     self._lazy_data[current_device] = cp.asarray(self._lazy_data[last_updated_device])
-        # elif isinstance(current_device, CPU):
-        #     if isinstance(last_updated_device, CUDAGPU):
-        #         self._lazy_data[current_device] = cp.asnumpy(self._lazy_data[last_updated_device])
-        #     elif isinstance(last_updated_device, CPU):
-        #         self._lazy_data[current_device] = np.array(self._lazy_data[last_updated_device])
-        # else:
-        #     raise NotImplementedError 
-        
         self._state[current_device] = self._state[last_updated_device]
 
-    def _is_data_available(self, device: Device):
+        # NOTE: Current fix for CuPy having no `writeable support` or maintaining flags
+        if self.constant and isinstance(self._lazy_data[current_device], np.ndarray):
+            self._lazy_data[current_device].flags.writeable = False 
+
+    def _is_data_available(self, device: Device) -> bool:
         return device in self._lazy_data
 
-    def _is_data_synced(self, device: Device):
+    def _is_data_synced(self, device: Device) -> bool:
         return self.state[device] == max(self.state.values())
 
 class MatBufferSpec(abc.ABC):
diff --git a/pyop3/device.py b/pyop3/device.py
index eadc74aaaf..0ad464f350 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -13,13 +13,6 @@ class Device(metaclass=ABCMeta):
     def __init__(self, device_index: int | None = None):
         pass
 
-    # TODO: Method should not be associated with object 
-    # NOTE: Would be weird to get current device as CPU whilst requesting on GPU
-    @staticmethod
-    def current():
-        device = _current_device.get()
-        return device
-
     @property
     def name(self):
         return self._name
@@ -32,6 +25,10 @@ def device_index(self):
     def asarray(self, arr):
         pass
 
+    @abstractmethod
+    def zeros_like(self, arr):
+        pass
+
     def __repr__(self):
         return self._name
         
@@ -47,12 +44,15 @@ def __init__(self, device_index: int | None = None):
         self._device_index = device_index
 
     def asarray(self, arr):
-        # NOTE: Better logic if we switch from just NumPy/CuPy
+        # NOTE: Better logic needed if we switch from just NumPy/CuPy
         if not isinstance(arr, np.ndarray):
             import cupy as cp
             return cp.asnumpy(arr)
     
-        return np.array(arr)    
+        return np.array(arr)
+
+    def zeros_like(self, arr):
+        return np.zeros_like(arr)
 
 class CUDAGPU(Device):
     
@@ -73,6 +73,10 @@ def __init__(self, device_index: int | None = None):
     def asarray(self, arr):
         import cupy as cp
         return cp.asarray(arr)
+    
+    def zeros_like(self, arr):
+        import cupy as cp
+        return cp.zeros_like(arr)
 
 @contextlib.contextmanager
 def offloading(dev: Device):
@@ -89,5 +93,5 @@ def offloading(dev: Device):
 # TODO: Should this const variable be here? 
 HOST_DEVICE = CPU() 
 
-# NOTE: Use contextvars to act as a bridge between buffer and manager class
+# NOTE: Use contextvars to act as a bridge between buffer and manager
 _current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE)
diff --git a/pyop3/utils.py b/pyop3/utils.py
index b9f34d4e10..7f991416af 100644
--- a/pyop3/utils.py
+++ b/pyop3/utils.py
@@ -14,7 +14,6 @@
 
 import cachetools
 import numpy as np
-import cupy as cp
 import pytools
 from immutabledict import immutabledict
 from mpi4py import MPI
@@ -27,6 +26,13 @@
 from pyop3.exceptions import CommMismatchException, CommNotFoundException, Pyop3Exception, UnhashableObjectException, UnsupportedArrayException
 from pyop3.mpi import collective
 
+ndarray_types = [np.ndarray,]
+try: 
+    import cupy as cp
+    ndarray_types = [np.ndarray, cp.ndarray]
+except ImportError:
+    pass
+
 
 class UniqueNameGenerator(pytools.UniqueNameGenerator):
     """Class for generating unique names."""
@@ -170,7 +176,7 @@ def is_sequence(item):
 
 def flatten(iterable):
     """Recursively flatten a nested iterable."""
-    if isinstance(iterable, np.ndarray):
+    if isinstance(iterable, tuple(ndarray_types)):
         return iterable.flatten()
     if not isinstance(iterable, (list, tuple)):
         return (iterable,)
@@ -587,8 +593,12 @@ def pretty_type(obj: Any) -> str:
 
 
 def safe_equals(a, b, /) -> bool:
-    if any(isinstance(x, np.ndarray) for x in [a, b]):
+    if any(isinstance(x, tuple(ndarray_types)) for x in [a, b]):
         return (a == b).all()
+    if any(isinstance(x, dict) for x in [a, b]):
+        if a.keys() != b.keys(): 
+            return False 
+        return all(safe_equals(a[k], b[k]) for k in a)
     else:
         return bool(a == b)
 
diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
index b124ba1c21..ec9f1c7b22 100644
--- a/pyop3_gpu_demo.py
+++ b/pyop3_gpu_demo.py
@@ -51,6 +51,9 @@
 
     # Do the assignment using MLIR (this is a later step)
     # g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="compile")
+    k = Function(V)
+    k.dat.buffer.duplicate()
+    k.dat.buffer.duplicate(copy=True)
 
     # state tracking checks
     assert f.dat.buffer.state[host] == 1  # modified once

From 67fe76a1db9ece12458e752bc173ce29b9ceb87a Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Fri, 1 May 2026 13:05:09 +0100
Subject: [PATCH 14/36] cleanup unnecessary todos/notes

---
 pyop3/buffer.py | 3 +--
 pyop3/device.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 6190689c2e..94dbe80342 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -233,13 +233,13 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer):
 
     # {{{ Instance attrs
 
-    # TODO: Update to lazily allocated pair of host/device arrays
     _lazy_data: dict[Device, np.ndarray | cp.ndarray] = dataclasses.field(repr=False)
     sf: StarForest
     _name: str
     _constant: bool
     _rank_equal: bool
     _ordered: bool
+
     # TODO: Should this be a defaultdict?
     _state: collections.defaultdict[Device, int]
     _last_updated_device: Device
@@ -329,7 +329,6 @@ def inc_state(self) -> None:
         self._state[ctx] += 1
         self._last_updated_device = ctx
 
-    # TODO: Either adjust for _lazy_data mapping or switch to _data
     def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
         # make sure that there are no pending transfers before we copy
         self.assemble()
diff --git a/pyop3/device.py b/pyop3/device.py
index 0ad464f350..d5b56f9487 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -90,7 +90,7 @@ def offloading(dev: Device):
     finally:
         _current_device.reset(token)
 
-# TODO: Should this const variable be here? 
+# NOTE: Should this const variable be here? 
 HOST_DEVICE = CPU() 
 
 # NOTE: Use contextvars to act as a bridge between buffer and manager

From 156831573bde4678c925f267624222d3181ed830 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Mon, 4 May 2026 10:21:08 +0100
Subject: [PATCH 15/36] removing notes and cleaning

---
 pyop3/buffer.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 94dbe80342..86174c335f 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -240,7 +240,7 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer):
     _rank_equal: bool
     _ordered: bool
 
-    # TODO: Should this be a defaultdict?
+    # TODO: Connor and I both dislike defaultdict but I can't think of an alternative atm 
     _state: collections.defaultdict[Device, int]
     _last_updated_device: Device
 
@@ -269,8 +269,7 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
             raise ValueError
 
         ctx = self.get_context()
-        data_mapping = {}
-        data_mapping[ctx] = ctx.asarray(data)
+        data_mapping = {ctx: ctx.asarray(data)}
 
         self._lazy_data = data_mapping 
         self.sf = sf
@@ -279,9 +278,8 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
         self._rank_equal = rank_equal
         self._max_value = max_value
         self._ordered = ordered
-        self._last_updated_device = self.get_context()
+        self._last_updated_device = ctx 
 
-        # NOTE: Connor and I don't like defaultdict, unsure how to change it atm 
         self._state = collections.defaultdict(int, [(ctx, 0)]) 
 
         # TODO: CuPy has no support for `writeable` flag 

From 25179949f1ccf5b30bd9788ce823e7ece9b2d2e0 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Mon, 4 May 2026 11:02:59 +0100
Subject: [PATCH 16/36] fix: added copy to avoid weak reference

- new state object (int -> dict) leads the reassignment being a weak
  reference.
---
 firedrake/mesh.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/firedrake/mesh.py b/firedrake/mesh.py
index eb4be68b18..cea52bbcd6 100644
--- a/firedrake/mesh.py
+++ b/firedrake/mesh.py
@@ -3772,7 +3772,7 @@ def __init__(self, coordinates):
 
         self._bounding_box_coords = None
         self._spatial_index = None
-        self._saved_coordinate_dat_version = coordinates.dat.buffer.state
+        self._saved_coordinate_dat_version = coordinates.dat.buffer.state.copy()
 
         self._cache = {}
 
@@ -4009,7 +4009,7 @@ def spatial_index(self):
         # Build spatial index
         with PETSc.Log.Event("spatial_index_build"):
             self._spatial_index = spatialindex.from_regions(coords_min, coords_max)
-        self._saved_coordinate_dat_version = self.coordinates.dat.buffer.state
+        self._saved_coordinate_dat_version = self.coordinates.dat.buffer.state.copy()
         return self._spatial_index
 
     @PETSc.Log.EventDecorator()

From 9e7c6adbd9a0fe19caae62500d1f2b1195c0a76e Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Mon, 4 May 2026 11:10:46 +0100
Subject: [PATCH 17/36] test: data_wo access works in context

---
 pyop3_gpu_demo.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
index ec9f1c7b22..86dcbaad8d 100644
--- a/pyop3_gpu_demo.py
+++ b/pyop3_gpu_demo.py
@@ -55,20 +55,27 @@
     k.dat.buffer.duplicate()
     k.dat.buffer.duplicate(copy=True)
 
+    k.dat.data_wo[...] = 1
+
     # state tracking checks
     assert f.dat.buffer.state[host] == 1  # modified once
     assert f.dat.buffer.state[gpu] == 1  # matches host
     assert g.dat.buffer.state[host] == 0  # untouched
     assert g.dat.buffer.state[gpu] == 1  # modified once
+    assert k.dat.buffer.state[host] == 0  # modified once
+    assert k.dat.buffer.state[gpu] == 1  # modified once
 
 # print(f"{g.dat.buffer._lazy_data=}")
 # print(f"{f.dat.buffer._lazy_data=}")
 assert isinstance(f.dat.data_ro, np.ndarray)
 assert isinstance(g.dat.data_ro, np.ndarray)
 assert (g.dat.data_ro == 23).all()
+assert (k.dat.data_ro == 1).all()
 
 # state tracking checks
 assert f.dat.buffer.state[host] == 1  # modified once
 assert f.dat.buffer.state[gpu] == 1  # matches host
 assert g.dat.buffer.state[host] == 1  # matches device
 assert g.dat.buffer.state[gpu] == 1  # modified once
+assert k.dat.buffer.state[host] == 1  # modified once
+assert k.dat.buffer.state[gpu] == 1  # modified once

From cb5f28ed242f55df5f690641f4841b2542ca5657 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Mon, 4 May 2026 11:23:05 +0100
Subject: [PATCH 18/36] add flatten from prev logic

---
 pyop3/buffer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 86174c335f..b9e4d81b5b 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -268,6 +268,7 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
         if rank_equal and not constant:
             raise ValueError
 
+        data = data.flatten()
         ctx = self.get_context()
         data_mapping = {ctx: ctx.asarray(data)}
 

From 36d2b07728c5db590a3c1e6ff7f84ad3710de2b6 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Mon, 4 May 2026 14:04:12 +0100
Subject: [PATCH 19/36] context function as global function and def state

- defaultdict gives -1 as default value if device object does not exist
---
 pyop3/buffer.py   | 27 ++++++++++++---------------
 pyop3/device.py   |  3 +++
 pyop3_gpu_demo.py |  6 +++---
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index b9e4d81b5b..4578d3d065 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -25,7 +25,7 @@
     CUDAGPU,
     CPU,
     HOST_DEVICE,
-    _current_device
+    get_current_device
 )
 
 from ._buffer_cy import set_petsc_mat_diagonal
@@ -269,8 +269,8 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
             raise ValueError
 
         data = data.flatten()
-        ctx = self.get_context()
-        data_mapping = {ctx: ctx.asarray(data)}
+        curr_dev = get_current_device()
+        data_mapping = {curr_dev: curr_dev.asarray(data)}
 
         self._lazy_data = data_mapping 
         self.sf = sf
@@ -279,9 +279,9 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
         self._rank_equal = rank_equal
         self._max_value = max_value
         self._ordered = ordered
-        self._last_updated_device = ctx 
+        self._last_updated_device = curr_dev 
 
-        self._state = collections.defaultdict(int, [(ctx, 0)]) 
+        self._state = collections.defaultdict(lambda: -1, [(curr_dev, 0)]) 
 
         # TODO: CuPy has no support for `writeable` flag 
         if constant and isinstance(self._data, np.ndarray):
@@ -324,9 +324,9 @@ def dtype(self) -> np.dtype:
         return self._data.dtype
 
     def inc_state(self) -> None:
-        ctx = self.get_context()
-        self._state[ctx] += 1
-        self._last_updated_device = ctx
+        curr_dev = get_current_device() 
+        self._state[curr_dev] += 1
+        self._last_updated_device = curr_dev
 
     def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
         # make sure that there are no pending transfers before we copy
@@ -340,9 +340,6 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
 
     is_nested: ClassVar[bool] = False
     
-    def get_context(self) -> Device:
-        return _current_device.get()
-
     @property
     def handle(self) -> np.ndarray:
         return self._data
@@ -490,10 +487,10 @@ def leaves_valid(self) -> bool:
 
     @property
     def _data(self):
-        ctx = self.get_context()
+        curr_dev = get_current_device() 
 
-        if not self._is_data_available(ctx) or not self._is_data_synced(ctx):
-            self.sync_devices(ctx)
+        if not self._is_data_available(curr_dev) or not self._is_data_synced(curr_dev):
+            self.sync_devices(curr_dev)
         
         # NOTE: If data is None, set to zeros? 
         # if self._lazy_data is None:
@@ -501,7 +498,7 @@ def _data(self):
 
         if self.name == "array_247_buffer":
             breakpoint()
-        return self._lazy_data[ctx]
+        return self._lazy_data[curr_dev]
 
     # TODO: I think the halo bits should only be handled at the Dat level via the
     # axis tree. Here we can just consider the array.
diff --git a/pyop3/device.py b/pyop3/device.py
index d5b56f9487..2fd0810d7f 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -95,3 +95,6 @@ def offloading(dev: Device):
 
 # NOTE: Use contextvars to act as a bridge between buffer and manager
 _current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE)
+
+def get_current_device():
+    return _current_device.get()
diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
index 86dcbaad8d..027c9a6957 100644
--- a/pyop3_gpu_demo.py
+++ b/pyop3_gpu_demo.py
@@ -37,9 +37,9 @@
 
 # state tracking checks, .buffer.state is now device-specific
 assert f.dat.buffer.state[host] == 1  # modified once
-assert f.dat.buffer.state[gpu] == 0  # untouched
+assert f.dat.buffer.state[gpu] == -1  # not created
 assert g.dat.buffer.state[host] == 0  # untouched
-assert g.dat.buffer.state[gpu] == 0  # untouched
+assert g.dat.buffer.state[gpu] == -1  # not created
 
 with op3.offloading(gpu):
     # Getting the .data attribute on the GPU should give us back a GPU array type
@@ -62,7 +62,7 @@
     assert f.dat.buffer.state[gpu] == 1  # matches host
     assert g.dat.buffer.state[host] == 0  # untouched
     assert g.dat.buffer.state[gpu] == 1  # modified once
-    assert k.dat.buffer.state[host] == 0  # modified once
+    assert k.dat.buffer.state[host] == -1 # not created 
     assert k.dat.buffer.state[gpu] == 1  # modified once
 
 # print(f"{g.dat.buffer._lazy_data=}")

From e5a010746f832019a261f9b8e679366ed2df005a Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Mon, 4 May 2026 14:14:59 +0100
Subject: [PATCH 20/36] fix: maintaining constant array property

- constant property was lost between cupy/numpy conversions
- fixed by passing kwarg that is disregarded for cupy but used for numpy
---
 pyop3/buffer.py | 17 +++++------------
 pyop3/device.py | 17 +++++++++++------
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 4578d3d065..053f739fa1 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -268,11 +268,8 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
         if rank_equal and not constant:
             raise ValueError
 
-        data = data.flatten()
         curr_dev = get_current_device()
-        data_mapping = {curr_dev: curr_dev.asarray(data)}
 
-        self._lazy_data = data_mapping 
         self.sf = sf
         self._name = name
         self._constant = constant
@@ -281,11 +278,11 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
         self._ordered = ordered
         self._last_updated_device = curr_dev 
 
-        self._state = collections.defaultdict(lambda: -1, [(curr_dev, 0)]) 
+        data = data.flatten()
+        data_mapping = {curr_dev: curr_dev.asarray(data, constant=self._constant)}
+        self._lazy_data = data_mapping 
 
-        # TODO: CuPy has no support for `writeable` flag 
-        if constant and isinstance(self._data, np.ndarray):
-            self._data.flags.writeable = False
+        self._state = collections.defaultdict(lambda: -1, [(curr_dev, 0)]) 
 
         self.__post_init__()
 
@@ -615,13 +612,9 @@ def _localized(self) -> ArrayBuffer:
     def sync_devices(self, current_device: Device):
         last_updated_device = self._last_updated_device
 
-        self._lazy_data[current_device] = current_device.asarray(self._lazy_data[last_updated_device])
+        self._lazy_data[current_device] = current_device.asarray(self._lazy_data[last_updated_device], constant=self.constant)
         self._state[current_device] = self._state[last_updated_device]
 
-        # NOTE: Current fix for CuPy having no `writeable support` or maintaining flags
-        if self.constant and isinstance(self._lazy_data[current_device], np.ndarray):
-            self._lazy_data[current_device].flags.writeable = False 
-
     def _is_data_available(self, device: Device) -> bool:
         return device in self._lazy_data
 
diff --git a/pyop3/device.py b/pyop3/device.py
index 2fd0810d7f..7fdf2695ca 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -22,7 +22,7 @@ def device_index(self):
         return self._device_index
 
     @abstractmethod
-    def asarray(self, arr):
+    def asarray(self, arr, *, constant=False):
         pass
 
     @abstractmethod
@@ -43,13 +43,18 @@ def __init__(self, device_index: int | None = None):
         self._registered_arrays = set()
         self._device_index = device_index
 
-    def asarray(self, arr):
+    def asarray(self, arr, *, constant=False):
         # NOTE: Better logic needed if we switch from just NumPy/CuPy
+        output = arr
         if not isinstance(arr, np.ndarray):
             import cupy as cp
-            return cp.asnumpy(arr)
-    
-        return np.array(arr)
+            output = cp.asnumpy(arr)
+        else:
+            output = np.array(output) 
+
+        if constant:
+            output.flags.writeable = False
+        return output
 
     def zeros_like(self, arr):
         return np.zeros_like(arr)
@@ -70,7 +75,7 @@ def __init__(self, device_index: int | None = None):
             # TODO: Raise No GPU exception
             raise NotImplementedError 
 
-    def asarray(self, arr):
+    def asarray(self, arr, *, constant=False):
         import cupy as cp
         return cp.asarray(arr)
     

From 39d0acd084f81b27f74ca4eee52319e263a1d310 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Mon, 4 May 2026 14:56:21 +0100
Subject: [PATCH 21/36] pr review: removing unused variables

---
 pyop3/device.py | 39 +++++++++++----------------------------
 1 file changed, 11 insertions(+), 28 deletions(-)

diff --git a/pyop3/device.py b/pyop3/device.py
index 7fdf2695ca..64beef918d 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -7,20 +7,11 @@
 import numpy as np
 
 class Device(metaclass=ABCMeta):
-    _name: str
-    _device_index: int | None
+    name: str
 
-    def __init__(self, device_index: int | None = None):
+    def __init__(self):
         pass
 
-    @property
-    def name(self):
-        return self._name
-
-    @property
-    def device_index(self):
-        return self._device_index
-
     @abstractmethod
     def asarray(self, arr, *, constant=False):
         pass
@@ -30,18 +21,16 @@ def zeros_like(self, arr):
         pass
 
     def __repr__(self):
-        return self._name
+        return self.name
         
     def __str__(self):
-        return self._name
+        return self.name
 
 class CPU(Device):
+    name = "CPU"
 
-    def __init__(self, device_index: int | None = None):
+    def __init__(self):
         super().__init__()
-        self._name = "cpu"
-        self._registered_arrays = set()
-        self._device_index = device_index
 
     def asarray(self, arr, *, constant=False):
         # NOTE: Better logic needed if we switch from just NumPy/CuPy
@@ -60,13 +49,10 @@ def zeros_like(self, arr):
         return np.zeros_like(arr)
 
 class CUDAGPU(Device):
+    name = "CudaGPU"
     
-    def __init__(self, device_index: int | None = None):
+    def __init__(self):
         super().__init__()
-        self._name = "CudaGPU"
-        self._registered_arrays = set()
-        self._token = None
-        self._device_index = device_index
 
         try:
             import cupy as cp
@@ -83,6 +69,9 @@ def zeros_like(self, arr):
         import cupy as cp
         return cp.zeros_like(arr)
 
+HOST_DEVICE = CPU() 
+_current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE)
+
 @contextlib.contextmanager
 def offloading(dev: Device):
     # TODO: Not Device exception
@@ -95,11 +84,5 @@ def offloading(dev: Device):
     finally:
         _current_device.reset(token)
 
-# NOTE: Should this const variable be here? 
-HOST_DEVICE = CPU() 
-
-# NOTE: Use contextvars to act as a bridge between buffer and manager
-_current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE)
-
 def get_current_device():
     return _current_device.get()

From e48d69893d6ccb04f0d224e0f5aa5d33b348734d Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Mon, 4 May 2026 15:07:13 +0100
Subject: [PATCH 22/36] remove dispatch to allow no-import cupy

---
 pyop3/utils.py | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/pyop3/utils.py b/pyop3/utils.py
index 7f991416af..b8008763af 100644
--- a/pyop3/utils.py
+++ b/pyop3/utils.py
@@ -337,22 +337,11 @@ def map_when(func, when_func, iterable):
         else:
             yield item
 
-
-@functools.singledispatch
-def readonly(array: Any) -> Any:
-    raise UnsupportedArrayException 
-
-@readonly.register
-def _(array: np.ndarray) -> np.ndarray:
-    """Return a readonly view of a numpy array."""
-    view = array.view()
-    view.setflags(write=False)
-    return view
-
-@readonly.register
-def _(array: cp.ndarray) -> cp.ndarray:
-    """ Return a view of a CuPy array."""
+def readonly(array: np.ndarray | cp.ndarray) -> np.ndarray | cp.ndarray:
+    """Return a readonly view of a numpy/cupy array."""
     view = array.view()
+    if isinstance(array, np.ndarray):
+        view.setflags(write=False)
     return view
 
 def debug_assert(predicate, msg=None):

From 7d582d6c55ad8b68b98e14880c8a16b721b0c4fe Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Mon, 4 May 2026 15:17:29 +0100
Subject: [PATCH 23/36] pr: fix property, duplicate, init

- using @property for last_updated_device, known by state, does not need
  to be variable.
- duplicate method only copies most up-to-date copy and non-copy
  duplicate only copies for current device
- initialisation adds None as optional for data input
---
 pyop3/buffer.py | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 053f739fa1..dc672312bc 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -242,11 +242,8 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer):
 
     # TODO: Connor and I both dislike defaultdict but I can't think of an alternative atm 
     _state: collections.defaultdict[Device, int]
-    _last_updated_device: Device
-
     _max_value: np.number | None = None
 
-
     # flags for tracking parallel correctness
     _leaves_valid: bool = True
     _pending_reduction: Callable | None = None
@@ -257,7 +254,10 @@ def instruction_executor_cache_key(self, buffer_counter: Mapping[AbstractBuffer,
             type(self), self._constant, self._rank_equal, self._ordered, 
             self.dtype, buffer_counter[self])
 
-    def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False):
+    def __init__(self, data: np.ndarray | cp.ndarray | None, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False):
+
+        data = data.flatten()
+        curr_dev = get_current_device()
 
         if sf is None:
             sf = NullStarForest(data.size)
@@ -268,20 +268,13 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None,
         if rank_equal and not constant:
             raise ValueError
 
-        curr_dev = get_current_device()
-
         self.sf = sf
         self._name = name
         self._constant = constant
         self._rank_equal = rank_equal
         self._max_value = max_value
         self._ordered = ordered
-        self._last_updated_device = curr_dev 
-
-        data = data.flatten()
-        data_mapping = {curr_dev: curr_dev.asarray(data, constant=self._constant)}
-        self._lazy_data = data_mapping 
-
+        self._lazy_data = {curr_dev: curr_dev.asarray(data, constant=self._constant)}
         self._state = collections.defaultdict(lambda: -1, [(curr_dev, 0)]) 
 
         self.__post_init__()
@@ -320,19 +313,23 @@ def size(self) -> int:
     def dtype(self) -> np.dtype:
         return self._data.dtype
 
+    @property
+    def _last_updated_device(self) -> Device:
+        return max(self.state, key=self.state.get)
+
     def inc_state(self) -> None:
         curr_dev = get_current_device() 
         self._state[curr_dev] += 1
-        self._last_updated_device = curr_dev
 
     def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
         # make sure that there are no pending transfers before we copy
         self.assemble()
         name = f"{self.name}_copy"
+        curr_dev = get_current_device()
         if copy:
-            data = {obj: arr.copy() for obj, arr in self._lazy_data.items()}
+            data = {curr_dev: self._lazy_data[curr_dev]}
         else:
-            data = {obj: obj.zeros_like(arr) for obj, arr in self._lazy_data.items()}
+            data = {curr_dev: curr_dev.zeros_like(self._lazy_data[curr_dev])}
         return self.__record_init__(_name=name, _lazy_data=data)
 
     is_nested: ClassVar[bool] = False
@@ -493,8 +490,6 @@ def _data(self):
         # if self._lazy_data is None:
         #    self._lazy_data = np.zeros(self.shape, dtype=self.dtype)
 
-        if self.name == "array_247_buffer":
-            breakpoint()
         return self._lazy_data[curr_dev]
 
     # TODO: I think the halo bits should only be handled at the Dat level via the

From 0528e763bcc1b180bed739f840aac61033892078 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Mon, 4 May 2026 15:48:47 +0100
Subject: [PATCH 24/36] fix: change petsc config version to v3.25.0

- v3.24.5 was failing to compile petsc4py due to PETSc no support for
  PCPatchSetComputeFunctionExteriorFacets
---
 pyproject.toml              | 2 +-
 scripts/firedrake-configure | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 1e9153c5d5..7b4d9ea1e6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -160,7 +160,7 @@ requires = [
   "mpi4py; python_version < '3.13'",
   "numpy",
   # TODO RELEASE
-  # "petsc4py==3.24.5",
+  # "petsc4py==3.25.0",
   "petsctools",
   "pkgconfig",
   "pybind11",
diff --git a/scripts/firedrake-configure b/scripts/firedrake-configure
index 0c1030808d..2199dbaa87 100755
--- a/scripts/firedrake-configure
+++ b/scripts/firedrake-configure
@@ -39,7 +39,7 @@ ARCH_DEFAULT = FiredrakeArch.DEFAULT
 ARCH_COMPLEX = FiredrakeArch.COMPLEX
 
 
-SUPPORTED_PETSC_VERSION = "v3.24.5"
+SUPPORTED_PETSC_VERSION = "v3.25.0"
 
 
 def main():

From e07e6c619060a8eb5c12db97efdf34dbdd0dd22c Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Wed, 6 May 2026 11:28:41 +0100
Subject: [PATCH 25/36] include cupy callable pointer

---
 pyop3/insn/exec.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pyop3/insn/exec.py b/pyop3/insn/exec.py
index e7193dadb1..cf570ec4c5 100644
--- a/pyop3/insn/exec.py
+++ b/pyop3/insn/exec.py
@@ -515,6 +515,14 @@ def _(self, handle: int):  # assumes an address
     def _(self, handle: np.ndarray) -> int:
         return handle.ctypes.data
 
+    try:
+        import cupy as cp
+        @_as_exec_argument.register
+        def _(self, handle: cp.ndarray) -> int:
+            return handle.data.ptr
+    except ImportError:
+        pass
+
     @_as_exec_argument.register
     def _(self, mat: PETSc.Mat) -> int:
         # Sometime the matrix is in an invalid state and we cannot return a handle.

From 0aea6d0777dd0d50d99f0a717c1c0a666e4439ea Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Wed, 6 May 2026 11:54:46 +0100
Subject: [PATCH 26/36] maintain CPU SF comms and cond exec with cupy

- Wrappped all SF comms with on_host decorator to ensure happens on host
    - This increases number of copies but all MPI happening on host atm
- Executable requires receiving a pointer to array. Providing a
  conditional singledispatch register - in case user does not have cupy.
---
 pyop3/buffer.py    | 11 ++++-------
 pyop3/device.py    | 11 +++++++++++
 pyop3/insn/exec.py |  2 +-
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index dc672312bc..163d230867 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -22,10 +22,8 @@
 from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly
 from pyop3.device import (
     Device,
-    CUDAGPU,
-    CPU,
-    HOST_DEVICE,
-    get_current_device
+    get_current_device,
+    on_host
 )
 
 from ._buffer_cy import set_petsc_mat_diagonal
@@ -224,9 +222,6 @@ def handle(self, *, nest_indices: tuple[tuple[int, ...], ...] = ()) -> Any:
         """The underlying data structure."""
 
 
-
-# NOTE: When GPU support is added, the host-device awareness and
-# copies should live in this class.
 @pyop3.record.record()
 class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer):
     """A buffer whose underlying data structure is a lazily-evaluated NumPy/CuPy array."""
@@ -520,6 +515,7 @@ def _reduction_ops(self):
         }
 
     @not_in_flight
+    @on_host
     def reduce_leaves_to_roots(self):
         self.reduce_leaves_to_roots_begin()
         self.reduce_leaves_to_roots_end()
@@ -548,6 +544,7 @@ def reduce_leaves_to_roots_end(self):
         self._finalizer = None
 
     @not_in_flight
+    @on_host
     def broadcast_roots_to_leaves(self):
         self.broadcast_roots_to_leaves_begin()
         self.broadcast_roots_to_leaves_end()
diff --git a/pyop3/device.py b/pyop3/device.py
index 64beef918d..d4b78b5879 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -84,5 +84,16 @@ def offloading(dev: Device):
     finally:
         _current_device.reset(token)
 
+def on_host(func):
+
+    def wrapper(*args, **kwargs):
+        token = _current_device.set(HOST_DEVICE)
+        try:
+            return func(*args, **kwargs)
+        finally:
+            _current_device.reset(token)
+
+    return wrapper 
+
 def get_current_device():
     return _current_device.get()
diff --git a/pyop3/insn/exec.py b/pyop3/insn/exec.py
index cf570ec4c5..64ef1d1d93 100644
--- a/pyop3/insn/exec.py
+++ b/pyop3/insn/exec.py
@@ -517,7 +517,7 @@ def _(self, handle: np.ndarray) -> int:
 
     try:
         import cupy as cp
-        @_as_exec_argument.register
+        @_as_exec_argument.register(cp.ndarray)
         def _(self, handle: cp.ndarray) -> int:
             return handle.data.ptr
     except ImportError:

From 6f64d835a7fe4ebc80844f7699c7c9b84492057b Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Wed, 6 May 2026 14:28:42 +0100
Subject: [PATCH 27/36] basic GPU unit tests covering context

---
 pyop3_gpu_demo.py                    | 16 ++---
 tests/pyop3/unit/test_gpu_context.py | 92 ++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 7 deletions(-)
 create mode 100644 tests/pyop3/unit/test_gpu_context.py

diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
index 027c9a6957..3b3baee6fe 100644
--- a/pyop3_gpu_demo.py
+++ b/pyop3_gpu_demo.py
@@ -21,6 +21,8 @@
 from firedrake import *
 import pyop3 as op3
 
+from pyop3.device import on_host
+
 
 # made up API, we need some way to identify the device
 host = op3.HOST_DEVICE  # or similar
@@ -55,18 +57,18 @@
     k.dat.buffer.duplicate()
     k.dat.buffer.duplicate(copy=True)
 
-    k.dat.data_wo[...] = 1
+    k.dat.assign(1, eager=True, eager_strategy="array")
+    
+    k.dat.data_rw[...] = 3 
 
     # state tracking checks
     assert f.dat.buffer.state[host] == 1  # modified once
     assert f.dat.buffer.state[gpu] == 1  # matches host
     assert g.dat.buffer.state[host] == 0  # untouched
     assert g.dat.buffer.state[gpu] == 1  # modified once
-    assert k.dat.buffer.state[host] == -1 # not created 
-    assert k.dat.buffer.state[gpu] == 1  # modified once
+    assert k.dat.buffer.state[host] == 2 # created for mpi sharing
+    assert k.dat.buffer.state[gpu] == 2  # modified 
 
-# print(f"{g.dat.buffer._lazy_data=}")
-# print(f"{f.dat.buffer._lazy_data=}")
 assert isinstance(f.dat.data_ro, np.ndarray)
 assert isinstance(g.dat.data_ro, np.ndarray)
 assert (g.dat.data_ro == 23).all()
@@ -77,5 +79,5 @@
 assert f.dat.buffer.state[gpu] == 1  # matches host
 assert g.dat.buffer.state[host] == 1  # matches device
 assert g.dat.buffer.state[gpu] == 1  # modified once
-assert k.dat.buffer.state[host] == 1  # modified once
-assert k.dat.buffer.state[gpu] == 1  # modified once
+assert k.dat.buffer.state[host] == 2  # modified twice
+assert k.dat.buffer.state[gpu] == 2  # modified twice 
diff --git a/tests/pyop3/unit/test_gpu_context.py b/tests/pyop3/unit/test_gpu_context.py
new file mode 100644
index 0000000000..484ad81cd7
--- /dev/null
+++ b/tests/pyop3/unit/test_gpu_context.py
@@ -0,0 +1,92 @@
+import pytest
+import numpy as np
+
+try:
+    import cupy as cp
+except ImportError as err:
+    pytest.exit("CuPy not available, skipping GPU tests...")
+
+
+import pyop3 as op3
+from firedrake import Function, FunctionSpace, UnitSquareMesh
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+HOST = op3.HOST_DEVICE
+CUDAGPU = op3.CUDAGPU()
+
+STATE_NOT_CREATED = -1
+STATE_UNTOUCHED = 0
+STATE_MODIFIED = 1
+
+@pytest.fixture(scope="module")
+def mesh():
+    return UnitSquareMesh(3, 3)
+
+@pytest.fixture(scope="module")
+def function_space(mesh):
+    return FunctionSpace(mesh, "P", 2)
+
+@pytest.fixture(scope="module")
+def f(function_space):
+    return Function(function_space)
+
+@pytest.fixture(scope="module")
+def g(function_space):
+    return Function(function_space)
+
+def state(func, device):
+    """Shorthand for reading buffer state on a given device."""
+    return func.dat.buffer.state[device]
+
+class TestInitialState:
+    def test_host_data_is_numpy(self, f):
+        assert isinstance(f.dat.data_ro, np.ndarray)
+
+    def test_host_state_modified(self, f):
+        """Assign affects buffer counter on host"""
+        f.dat.assign(10, eager=True, eager_strategy="array")
+        assert state(f, HOST) == 1
+
+    def test_gpu_state_not_created(self, f):
+        """CUDAGPU buffer should not exist before any offloading."""
+        assert state(f, CUDAGPU) == STATE_NOT_CREATED
+
+class TestOffloadingArrayTypes:
+    """Inside op3.offloading, data array type should be GPU array types"""
+
+    def test_buffer_evaluates_cupy_on_cudagpu(self):
+        mesh = UnitSquareMesh(3, 3)
+        V = FunctionSpace(mesh, "P", 2)
+
+        f = Function(V).assign(10)
+        g = Function(V)
+        with op3.offloading(CUDAGPU):
+            assert not isinstance(f.dat.data_ro, np.ndarray)
+
+    def test_buffer_creation_on_cudagpu(self):
+        mesh = UnitSquareMesh(3, 3)
+        V = FunctionSpace(mesh, "P", 2)
+
+        f = Function(V).assign(10)
+        g = Function(V)
+        with op3.offloading(CUDAGPU):
+            k = Function(V)
+            assert not isinstance(k.dat.data_ro, np.ndarray)
+
+class TestOffloadingAssignmentState:
+    
+    @pytest.fixture(autouse=True)
+    def _run_gpu_assign(self, f, g):
+        with op3.offloading(CUDAGPU):
+            g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array")
+
+    def test_host_state_untouched_after_gpu_assign(self, g):
+        """g was not modified on host"""
+        assert state(g, HOST) == 0 
+
+    def test_g_gpu_state_modified_after_assign(self, g):
+        assert state(g, CUDAGPU) == 1

From c8db3e20e5e65c85293e0c00b189e452a088c4da Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Wed, 6 May 2026 18:07:27 +0100
Subject: [PATCH 28/36] test cases for gpu - no fixtures due to bug

---
 tests/pyop3/unit/test_gpu_context.py | 71 ++++++++++++++++++++--------
 1 file changed, 51 insertions(+), 20 deletions(-)

diff --git a/tests/pyop3/unit/test_gpu_context.py b/tests/pyop3/unit/test_gpu_context.py
index 484ad81cd7..69ab4f9752 100644
--- a/tests/pyop3/unit/test_gpu_context.py
+++ b/tests/pyop3/unit/test_gpu_context.py
@@ -11,10 +11,6 @@
 from firedrake import Function, FunctionSpace, UnitSquareMesh
 
 
-# ---------------------------------------------------------------------------
-# Fixtures
-# ---------------------------------------------------------------------------
-
 HOST = op3.HOST_DEVICE
 CUDAGPU = op3.CUDAGPU()
 
@@ -22,21 +18,21 @@
 STATE_UNTOUCHED = 0
 STATE_MODIFIED = 1
 
-@pytest.fixture(scope="module")
+@pytest.fixture()
 def mesh():
     return UnitSquareMesh(3, 3)
 
-@pytest.fixture(scope="module")
-def function_space(mesh):
+@pytest.fixture()
+def V(mesh):
     return FunctionSpace(mesh, "P", 2)
 
-@pytest.fixture(scope="module")
-def f(function_space):
-    return Function(function_space)
+@pytest.fixture()
+def f(V):
+    return Function(V)
 
-@pytest.fixture(scope="module")
-def g(function_space):
-    return Function(function_space)
+@pytest.fixture()
+def g(V):
+    return Function(V)
 
 def state(func, device):
     """Shorthand for reading buffer state on a given device."""
@@ -55,6 +51,8 @@ def test_gpu_state_not_created(self, f):
         """CUDAGPU buffer should not exist before any offloading."""
         assert state(f, CUDAGPU) == STATE_NOT_CREATED
 
+# NOTE: `pytest.fixture`s not used for Offloading GPU tests due to segfault
+# Unsure what is causing but we are leaving for now.
 class TestOffloadingArrayTypes:
     """Inside op3.offloading, data array type should be GPU array types"""
 
@@ -78,15 +76,48 @@ def test_buffer_creation_on_cudagpu(self):
             assert not isinstance(k.dat.data_ro, np.ndarray)
 
 class TestOffloadingAssignmentState:
-    
-    @pytest.fixture(autouse=True)
-    def _run_gpu_assign(self, f, g):
-        with op3.offloading(CUDAGPU):
-            g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array")
 
-    def test_host_state_untouched_after_gpu_assign(self, g):
+    def test_host_state_untouched_after_gpu_assign(self):
         """g was not modified on host"""
+        mesh = UnitSquareMesh(3, 3)
+        V = FunctionSpace(mesh, "P", 2)
+
+        f = Function(V).assign(10)
+        g = Function(V)
+        with op3.offloading(CUDAGPU):
+            g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array")
         assert state(g, HOST) == 0 
 
-    def test_g_gpu_state_modified_after_assign(self, g):
+    def test_gpu_state_modified_after_assign(self):
+        mesh = UnitSquareMesh(3, 3)
+        V = FunctionSpace(mesh, "P", 2)
+
+        f = Function(V).assign(10)
+        g = Function(V)
+        with op3.offloading(CUDAGPU):
+            g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array")
         assert state(g, CUDAGPU) == 1
+
+class TestOffloadingArraysUpdated:
+
+    def test_gpu_array_modified(self):
+        '''Data on GPU is updated in GPU context'''
+        mesh = UnitSquareMesh(3, 3)
+        V = FunctionSpace(mesh, "P", 2)
+
+        f = Function(V).assign(10)
+        g = Function(V)
+        with op3.offloading(CUDAGPU):
+            g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array")
+            assert (g.dat.data_ro == 23).all()
+
+    def test_gpu_array_modified(self):
+        ''' Data on CPU is updated when in CPU context'''
+        mesh = UnitSquareMesh(3, 3)
+        V = FunctionSpace(mesh, "P", 2)
+
+        f = Function(V).assign(10)
+        g = Function(V)
+        with op3.offloading(CUDAGPU):
+            g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array")
+        assert (g.dat.data_ro == 23).all()

From 7aa8425d9c096f9b70ed3e6cfa0b1611803acc2c Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Wed, 6 May 2026 18:08:37 +0100
Subject: [PATCH 29/36] fixed bug by amending record_modified state

bug regarding:
    if an array is modified as the first action when offloaded:
    1. state for new device is updated and incremented (as
       modifying)
    2. then buffer attempts to sync from most up-to-date device
    3. this may be current device as state was incremented first
    4. then device tries to copy non-existent array to itself - bad
solution:
    adapt record-modified decorator to increment state after wrapped
    function
    - simple try-finally wrapped trick
---
 pyop3/buffer.py   |  8 +++++---
 pyop3_gpu_demo.py | 11 ++++-------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 163d230867..6a89f6f8ad 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -61,8 +61,10 @@ def wrapper(self, *args, **kwargs):
 def record_modified(func):
     def wrapper(self, *args, **kwargs):
         assert not self.constant
-        self.inc_state()
-        return func(self, *args, **kwargs)
+        try:
+            return func(self, *args, **kwargs)
+        finally:
+            self.inc_state()
     return wrapper
 
 
@@ -314,7 +316,7 @@ def _last_updated_device(self) -> Device:
 
     def inc_state(self) -> None:
         curr_dev = get_current_device() 
-        self._state[curr_dev] += 1
+        self.state[curr_dev] = self.state.get(curr_dev, 0) + 1
 
     def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
         # make sure that there are no pending transfers before we copy
diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
index 3b3baee6fe..6414718467 100644
--- a/pyop3_gpu_demo.py
+++ b/pyop3_gpu_demo.py
@@ -57,8 +57,6 @@
     k.dat.buffer.duplicate()
     k.dat.buffer.duplicate(copy=True)
 
-    k.dat.assign(1, eager=True, eager_strategy="array")
-    
     k.dat.data_rw[...] = 3 
 
     # state tracking checks
@@ -66,18 +64,17 @@
     assert f.dat.buffer.state[gpu] == 1  # matches host
     assert g.dat.buffer.state[host] == 0  # untouched
     assert g.dat.buffer.state[gpu] == 1  # modified once
-    assert k.dat.buffer.state[host] == 2 # created for mpi sharing
-    assert k.dat.buffer.state[gpu] == 2  # modified 
+    assert k.dat.buffer.state[host] == -1 # not created 
+    assert k.dat.buffer.state[gpu] == 1  # modified 
 
 assert isinstance(f.dat.data_ro, np.ndarray)
 assert isinstance(g.dat.data_ro, np.ndarray)
 assert (g.dat.data_ro == 23).all()
-assert (k.dat.data_ro == 1).all()
 
 # state tracking checks
 assert f.dat.buffer.state[host] == 1  # modified once
 assert f.dat.buffer.state[gpu] == 1  # matches host
 assert g.dat.buffer.state[host] == 1  # matches device
 assert g.dat.buffer.state[gpu] == 1  # modified once
-assert k.dat.buffer.state[host] == 2  # modified twice
-assert k.dat.buffer.state[gpu] == 2  # modified twice 
+assert k.dat.buffer.state[host] == -1  # not created
+assert k.dat.buffer.state[gpu] == 1  # modified once 

From 5be4a8c549a15f3c4a39d8703c2399f73b8ff66b Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Tue, 12 May 2026 13:06:11 +0100
Subject: [PATCH 30/36] tests and fix: cached properties on gpu

basic unit tests introduced for gpu context as ground truth

bug: if AxisTree for FunctionSpace (or sim. object) not cached
until on device, it attempts to `compile` - this is not implemented for
GPU yet.

fix: this has been fixed with some patches for now, to be removed.
- `firedrake/functionspaceimpl::make_dat` is wrapped in on_host
- `pyop3/tree/axis_tree/tree.py::_buffer_indices` is wrapped in on_host
- `pyop3/buffer.py` has necessary sync in duplicate (in scenario that
  user assigns and immediately copies)

With current fix, this does mean that if the AxisTree properties are not cached,
the assign operation will happen on CPU - consequent calls work on GPU.
---
 firedrake/functionspaceimpl.py       |  3 +
 pyop3/buffer.py                      |  9 ++-
 pyop3/insn/exec.py                   |  4 +-
 pyop3/tree/axis_tree/tree.py         |  5 ++
 pyop3_gpu_demo.py                    | 11 ++--
 tests/pyop3/unit/test_gpu_context.py | 98 ++++++++++++++--------------
 6 files changed, 74 insertions(+), 56 deletions(-)

diff --git a/firedrake/functionspaceimpl.py b/firedrake/functionspaceimpl.py
index 53fd9308ee..1fe2451417 100644
--- a/firedrake/functionspaceimpl.py
+++ b/firedrake/functionspaceimpl.py
@@ -31,6 +31,7 @@
 from pyop3 import mpi
 from pyop3.utils import just_one, single_valued
 from pyop3.cache import cached_on, with_heavy_caches, cached_method
+from pyop3.device import on_host
 from finat.quadrature import QuadratureRule
 
 from ufl.cell import CellSequence
@@ -1412,7 +1413,9 @@ def dim(self):
         See also :attr:`FunctionSpace.dof_count` and :attr:`FunctionSpace.node_count` ."""
         return self.template_vec.getSize()
 
+    # TODO: `on_host` decorator only exists while `compile` strategy does not work on device
     @_with_mesh_heavy_cache
+    @on_host
     def make_dat(self, val=None, valuetype=None, name=None):
         """Return a new Dat storing DoFs for the function space."""
         if val is not None:
diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 6a89f6f8ad..18f39283cb 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -323,6 +323,12 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
         self.assemble()
         name = f"{self.name}_copy"
         curr_dev = get_current_device()
+
+        # TODO: Fix for first-assign, immediate duplicate bug
+        # This can be removed once `compile` strategy works on device
+        if curr_dev not in self._lazy_data:
+            self.sync_devices(curr_dev)
+
         if copy:
             data = {curr_dev: self._lazy_data[curr_dev]}
         else:
@@ -332,7 +338,7 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
     is_nested: ClassVar[bool] = False
     
     @property
-    def handle(self) -> np.ndarray:
+    def handle(self) -> np.ndarray | cp.ndarray:
         return self._data
 
     @property
@@ -486,7 +492,6 @@ def _data(self):
         # NOTE: If data is None, set to zeros? 
         # if self._lazy_data is None:
         #    self._lazy_data = np.zeros(self.shape, dtype=self.dtype)
-
         return self._lazy_data[curr_dev]
 
     # TODO: I think the halo bits should only be handled at the Dat level via the
diff --git a/pyop3/insn/exec.py b/pyop3/insn/exec.py
index 64ef1d1d93..5514f7b158 100644
--- a/pyop3/insn/exec.py
+++ b/pyop3/insn/exec.py
@@ -517,9 +517,11 @@ def _(self, handle: np.ndarray) -> int:
 
     try:
         import cupy as cp
+        # NOTE: This gives a pointer to a GPU memory address.
+        # Loopy cannot work with GPU so this will lead to a segfault. 
         @_as_exec_argument.register(cp.ndarray)
         def _(self, handle: cp.ndarray) -> int:
-            return handle.data.ptr
+            raise MemoryError("SegFault will occur if you pass a CuPy GPU pointer to Loopy/C code")
     except ImportError:
         pass
 
diff --git a/pyop3/tree/axis_tree/tree.py b/pyop3/tree/axis_tree/tree.py
index a60dab9d76..5d5781ae8a 100644
--- a/pyop3/tree/axis_tree/tree.py
+++ b/pyop3/tree/axis_tree/tree.py
@@ -64,6 +64,8 @@
     strictly_all,
 )
 
+from pyop3.device import on_host
+
 
 if typing.TYPE_CHECKING:
     from pyop3.expr import LinearDatBufferExpression
@@ -1641,7 +1643,10 @@ def materialize(self):
 
     # TODO: how do we know if buffer_slice will produce the same object across all ranks?
     # Need to make forming a slice or a subset an active decision!
+    
+    # TODO: on_host decorator only required while `compile` strategy does not work for device offloading
     @cached_property
+    @on_host
     def _buffer_indices(self) -> np.ndarray[IntType]:
         from pyop3 import Dat, do_loop
 
diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
index 6414718467..0c856e509f 100644
--- a/pyop3_gpu_demo.py
+++ b/pyop3_gpu_demo.py
@@ -48,11 +48,11 @@
     assert not isinstance(f.dat.data_ro, np.ndarray)
     assert not isinstance(g.dat.data_ro, np.ndarray)
 
-    # Do the assignment using array operations
+    # # Do the assignment using array operations
     g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="array")
 
-    # Do the assignment using MLIR (this is a later step)
-    # g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="compile")
+    # # Do the assignment using MLIR (this is a later step)
+    # # g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="compile")
     k = Function(V)
     k.dat.buffer.duplicate()
     k.dat.buffer.duplicate(copy=True)
@@ -64,17 +64,18 @@
     assert f.dat.buffer.state[gpu] == 1  # matches host
     assert g.dat.buffer.state[host] == 0  # untouched
     assert g.dat.buffer.state[gpu] == 1  # modified once
-    assert k.dat.buffer.state[host] == -1 # not created 
+    assert k.dat.buffer.state[host] == 0 # not modified
     assert k.dat.buffer.state[gpu] == 1  # modified 
 
 assert isinstance(f.dat.data_ro, np.ndarray)
 assert isinstance(g.dat.data_ro, np.ndarray)
 assert (g.dat.data_ro == 23).all()
+assert (k.dat.data_ro == 3).all()
 
 # state tracking checks
 assert f.dat.buffer.state[host] == 1  # modified once
 assert f.dat.buffer.state[gpu] == 1  # matches host
 assert g.dat.buffer.state[host] == 1  # matches device
 assert g.dat.buffer.state[gpu] == 1  # modified once
-assert k.dat.buffer.state[host] == -1  # not created
+assert k.dat.buffer.state[host] == 1  # matches device 
 assert k.dat.buffer.state[gpu] == 1  # modified once 
diff --git a/tests/pyop3/unit/test_gpu_context.py b/tests/pyop3/unit/test_gpu_context.py
index 69ab4f9752..1f59580d07 100644
--- a/tests/pyop3/unit/test_gpu_context.py
+++ b/tests/pyop3/unit/test_gpu_context.py
@@ -23,101 +23,103 @@ def mesh():
     return UnitSquareMesh(3, 3)
 
 @pytest.fixture()
-def V(mesh):
+def FuncSpace(mesh):
     return FunctionSpace(mesh, "P", 2)
 
 @pytest.fixture()
-def f(V):
-    return Function(V)
+def f(FuncSpace):
+    return Function(FuncSpace).assign(10)
 
 @pytest.fixture()
-def g(V):
-    return Function(V)
+def g(FuncSpace):
+    return Function(FuncSpace)
 
 def state(func, device):
     """Shorthand for reading buffer state on a given device."""
     return func.dat.buffer.state[device]
 
+
 class TestInitialState:
     def test_host_data_is_numpy(self, f):
         assert isinstance(f.dat.data_ro, np.ndarray)
 
     def test_host_state_modified(self, f):
         """Assign affects buffer counter on host"""
+        old_state = state(f, HOST)
         f.dat.assign(10, eager=True, eager_strategy="array")
-        assert state(f, HOST) == 1
+        assert state(f, HOST) == old_state + 1
 
     def test_gpu_state_not_created(self, f):
         """CUDAGPU buffer should not exist before any offloading."""
         assert state(f, CUDAGPU) == STATE_NOT_CREATED
 
-# NOTE: `pytest.fixture`s not used for Offloading GPU tests due to segfault
-# Unsure what is causing but we are leaving for now.
 class TestOffloadingArrayTypes:
     """Inside op3.offloading, data array type should be GPU array types"""
 
-    def test_buffer_evaluates_cupy_on_cudagpu(self):
-        mesh = UnitSquareMesh(3, 3)
-        V = FunctionSpace(mesh, "P", 2)
-
-        f = Function(V).assign(10)
-        g = Function(V)
+    def test_buffer_evaluates_cupy_on_cudagpu(self, FuncSpace):
+        f = Function(FuncSpace).assign(10)
         with op3.offloading(CUDAGPU):
-            assert not isinstance(f.dat.data_ro, np.ndarray)
-
-    def test_buffer_creation_on_cudagpu(self):
-        mesh = UnitSquareMesh(3, 3)
-        V = FunctionSpace(mesh, "P", 2)
+            assert isinstance(f.dat.data_ro, cp.ndarray)
 
-        f = Function(V).assign(10)
-        g = Function(V)
+    def test_buffer_creation_on_cudagpu(self, FuncSpace):
         with op3.offloading(CUDAGPU):
-            k = Function(V)
-            assert not isinstance(k.dat.data_ro, np.ndarray)
+            k = Function(FuncSpace)
+            assert isinstance(k.dat.data_ro, cp.ndarray)
 
 class TestOffloadingAssignmentState:
 
-    def test_host_state_untouched_after_gpu_assign(self):
+    def test_host_state_untouched_after_gpu_assign(self, f, g):
         """g was not modified on host"""
-        mesh = UnitSquareMesh(3, 3)
-        V = FunctionSpace(mesh, "P", 2)
-
-        f = Function(V).assign(10)
-        g = Function(V)
         with op3.offloading(CUDAGPU):
             g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array")
         assert state(g, HOST) == 0 
 
-    def test_gpu_state_modified_after_assign(self):
-        mesh = UnitSquareMesh(3, 3)
-        V = FunctionSpace(mesh, "P", 2)
-
-        f = Function(V).assign(10)
-        g = Function(V)
+    def test_gpu_state_modified_after_assign(self, f, g):
+        """g was modified on CUDAGPU"""
         with op3.offloading(CUDAGPU):
             g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array")
         assert state(g, CUDAGPU) == 1
 
 class TestOffloadingArraysUpdated:
 
-    def test_gpu_array_modified(self):
+    def test_gpu_array_modified(self, g):
         '''Data on GPU is updated in GPU context'''
-        mesh = UnitSquareMesh(3, 3)
-        V = FunctionSpace(mesh, "P", 2)
-
-        f = Function(V).assign(10)
-        g = Function(V)
         with op3.offloading(CUDAGPU):
-            g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array")
+            g.dat.assign(23, eager=True, eager_strategy="array")
             assert (g.dat.data_ro == 23).all()
 
-    def test_gpu_array_modified(self):
+    def test_gpu_array_modified_copied_to_host(self, g):
         ''' Data on CPU is updated when in CPU context'''
-        mesh = UnitSquareMesh(3, 3)
-        V = FunctionSpace(mesh, "P", 2)
+        with op3.offloading(CUDAGPU):
+            g.dat.assign(23, eager=True, eager_strategy="array")
+        assert (g.dat.data_ro == 23).all()
 
-        f = Function(V).assign(10)
-        g = Function(V)
+    def test_gpu_data_wo_copied_to_host(self, g):
+        ''' Data on CPU is updated when in CPU context'''
         with op3.offloading(CUDAGPU):
-            g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array")
+            g.dat.data_wo[...] = 23
         assert (g.dat.data_ro == 23).all()
+
+class TestDeviceArrayDuplication:
+    
+    def test_duplicate_not_same(self, FuncSpace):
+        """Duplicate buffer is not same object"""
+        with op3.offloading(CUDAGPU):
+            k = Function(FuncSpace)
+            k_dup_buffer = k.dat.buffer.duplicate()
+            assert type(k_dup_buffer) == type(k.dat.buffer)
+            assert not k_dup_buffer is k.dat.buffer
+
+    def test_duplicate_to_device(self, FuncSpace):
+        """ Buffer maintains device context when copied"""
+        with op3.offloading(CUDAGPU):
+            k = Function(FuncSpace)
+            k_dup_buffer = k.dat.buffer.duplicate()
+            assert isinstance(k_dup_buffer.data_ro, cp.ndarray)
+
+    def test_duplicate_copy_to_device(self, FuncSpace):
+        """ Buffer maintains device context when exact copy"""
+        with op3.offloading(CUDAGPU):
+            k = Function(FuncSpace)
+            k_dup_buffer = k.dat.buffer.duplicate(copy=True)
+            assert isinstance(k_dup_buffer.data_ro, cp.ndarray)

From f13342a61f2803143bc3e1a45ae7865a59d32296 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Tue, 12 May 2026 13:27:04 +0100
Subject: [PATCH 31/36] cleaning comments

---
 pyop3/buffer.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index 18f39283cb..f1953da0d0 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -237,7 +237,6 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer):
     _rank_equal: bool
     _ordered: bool
 
-    # TODO: Connor and I both dislike defaultdict but I can't think of an alternative atm 
     _state: collections.defaultdict[Device, int]
     _max_value: np.number | None = None
 
@@ -489,9 +488,6 @@ def _data(self):
         if not self._is_data_available(curr_dev) or not self._is_data_synced(curr_dev):
             self.sync_devices(curr_dev)
         
-        # NOTE: If data is None, set to zeros? 
-        # if self._lazy_data is None:
-        #    self._lazy_data = np.zeros(self.shape, dtype=self.dtype)
         return self._lazy_data[curr_dev]
 
     # TODO: I think the halo bits should only be handled at the Dat level via the

From 5fbbb8dd030b8ad3e023f91943b9c0c7e65f8d24 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Tue, 12 May 2026 16:23:09 +0100
Subject: [PATCH 32/36] pr: resolving comments and adding docstrings

---
 pyop3/buffer.py | 28 ++++++++++++------
 pyop3/device.py | 76 +++++++++++++++++++++++++++++++++----------------
 pyop3/utils.py  |  4 +--
 3 files changed, 72 insertions(+), 36 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index f1953da0d0..d53833d4f3 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -250,7 +250,16 @@ def instruction_executor_cache_key(self, buffer_counter: Mapping[AbstractBuffer,
             type(self), self._constant, self._rank_equal, self._ordered, 
             self.dtype, buffer_counter[self])
 
-    def __init__(self, data: np.ndarray | cp.ndarray | None, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False):
+    def __init__(
+            self, data: np.ndarray | cp.ndarray | None, 
+            sf: StarForest | None = None, *, 
+            name: str|None=None,
+            prefix:str|None=None,
+            constant:bool=False, 
+            rank_equal: bool = False, 
+            max_value: numbers.Number | None=None, 
+            ordered:bool=False
+        ):
 
         data = data.flatten()
         curr_dev = get_current_device()
@@ -326,7 +335,7 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
         # TODO: Fix for first-assign, immediate duplicate bug
         # This can be removed once `compile` strategy works on device
         if curr_dev not in self._lazy_data:
-            self.sync_devices(curr_dev)
+            self.sync_devices()
 
         if copy:
             data = {curr_dev: self._lazy_data[curr_dev]}
@@ -485,8 +494,8 @@ def leaves_valid(self) -> bool:
     def _data(self):
         curr_dev = get_current_device() 
 
-        if not self._is_data_available(curr_dev) or not self._is_data_synced(curr_dev):
-            self.sync_devices(curr_dev)
+        if not self._is_data_available_and_synced(curr_dev):
+            self.sync_devices()
         
         return self._lazy_data[curr_dev]
 
@@ -604,17 +613,18 @@ def localize(self) -> ArrayBuffer:
     def _localized(self) -> ArrayBuffer:
         return self.__record_init__(sf=None)
     
-    def sync_devices(self, current_device: Device):
+    def sync_devices(self):
         last_updated_device = self._last_updated_device
+        current_device = get_current_device()
 
         self._lazy_data[current_device] = current_device.asarray(self._lazy_data[last_updated_device], constant=self.constant)
         self._state[current_device] = self._state[last_updated_device]
 
-    def _is_data_available(self, device: Device) -> bool:
-        return device in self._lazy_data
+    def _is_data_available_and_synced(self, device: Device) -> bool:
+        is_available = device in self._lazy_data
+        is_synced = self.state[device] == max(self.state.values())
+        return is_available and is_synced
 
-    def _is_data_synced(self, device: Device) -> bool:
-        return self.state[device] == max(self.state.values())
 
 class MatBufferSpec(abc.ABC):
     pass
diff --git a/pyop3/device.py b/pyop3/device.py
index d4b78b5879..c0bf3703af 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -7,11 +7,13 @@
 import numpy as np
 
 class Device(metaclass=ABCMeta):
+    """
+    Device - Abstract class
+    - Base for future GPU implementations
+    - All device-specific logic should be kept in here
+    """
     name: str
 
-    def __init__(self):
-        pass
-
     @abstractmethod
     def asarray(self, arr, *, constant=False):
         pass
@@ -27,53 +29,77 @@ def __str__(self):
         return self.name
 
 class CPU(Device):
+    """
+    CPU Class, designed to be host object.
+    - Plausible to have multiple CPUs, functionally similar to having GPU
+    """
     name = "CPU"
 
-    def __init__(self):
-        super().__init__()
-
     def asarray(self, arr, *, constant=False):
-        # NOTE: Better logic needed if we switch from just NumPy/CuPy
-        output = arr
-        if not isinstance(arr, np.ndarray):
+        """ Convert GPU/CuPy/NumPy input array to CPU-compliant NumPy array """
+        try:
             import cupy as cp
+        except ImportError:
+            cp = None
+        
+        if cp and isinstance(arr, cp.ndarray):
             output = cp.asnumpy(arr)
+        elif isinstance(arr, np.ndarray):
+            output = np.array(arr)
+            if constant:
+                output.flags.writeable = False
         else:
-            output = np.array(output) 
+            raise TypeError(f"{type(arr)} not supported.")
 
-        if constant:
-            output.flags.writeable = False
         return output
 
+
     def zeros_like(self, arr):
         return np.zeros_like(arr)
 
 class CUDAGPU(Device):
+    """ 
+    GPU class for Nvidia GPUs 
+    - All offloading will be done through CuPy
+    - Multiple instantiations will be independent of each other 
+    """
     name = "CudaGPU"
     
     def __init__(self):
-        super().__init__()
-
         try:
-            import cupy as cp
-            assert cp.is_available() 
+            assert self.cp.is_available() 
         except:
             # TODO: Raise No GPU exception
             raise NotImplementedError 
 
-    def asarray(self, arr, *, constant=False):
+    @property
+    def cp(self):
         import cupy as cp
-        return cp.asarray(arr)
+        return cp
+
+    def asarray(self, arr, *, constant=False):
+        return self.cp.asarray(arr)
     
     def zeros_like(self, arr):
-        import cupy as cp
-        return cp.zeros_like(arr)
+        return self.cp.zeros_like(arr)
 
 HOST_DEVICE = CPU() 
+
+""" 
+    Global context variable for determining device context
+    - This should not be imported to other modules - value accessed through getter
+    - All modification should be controlled via the offloading function.
+"""
 _current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE)
 
 @contextlib.contextmanager
 def offloading(dev: Device):
+    """ 
+    Context Manager for offloading components to select device
+    - Controls global _current_device variable
+    - Stores former device in `token` to allow stacking of context changes
+    """
+
     # TODO: Not Device exception
     if not isinstance(dev, Device):
         raise NotImplementedError
@@ -85,13 +111,13 @@ def offloading(dev: Device):
         _current_device.reset(token)
 
 def on_host(func):
-
+    """
+    Decorator for components that we want to stay on host device
+    i.e. MPI communications/StarForest
+    """
     def wrapper(*args, **kwargs):
-        token = _current_device.set(HOST_DEVICE)
-        try:
+        with offloading(HOST_DEVICE):
             return func(*args, **kwargs)
-        finally:
-            _current_device.reset(token)
 
     return wrapper 
 
diff --git a/pyop3/utils.py b/pyop3/utils.py
index b8008763af..b9276d0c82 100644
--- a/pyop3/utils.py
+++ b/pyop3/utils.py
@@ -29,7 +29,7 @@
 ndarray_types = [np.ndarray,]
 try: 
     import cupy as cp
-    ndarray_types = [np.ndarray, cp.ndarray]
+    ndarray_types.append(cp.ndarray) 
 except ImportError:
     pass
 
@@ -584,7 +584,7 @@ def pretty_type(obj: Any) -> str:
 def safe_equals(a, b, /) -> bool:
     if any(isinstance(x, tuple(ndarray_types)) for x in [a, b]):
         return (a == b).all()
-    if any(isinstance(x, dict) for x in [a, b]):
+    if any(isinstance(x, Mapping) for x in [a, b]):
         if a.keys() != b.keys(): 
             return False 
         return all(safe_equals(a[k], b[k]) for k in a)

From 1e7755f9d8a2eb6fa65fb154c7c461040b98a129 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Tue, 12 May 2026 16:25:50 +0100
Subject: [PATCH 33/36] pr: clean tests and remove gpu demo

---
 tests/pyop3/unit/test_gpu_context.py | 33 ++++++++++++++--------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/tests/pyop3/unit/test_gpu_context.py b/tests/pyop3/unit/test_gpu_context.py
index 1f59580d07..51306abad1 100644
--- a/tests/pyop3/unit/test_gpu_context.py
+++ b/tests/pyop3/unit/test_gpu_context.py
@@ -4,7 +4,7 @@
 try:
     import cupy as cp
 except ImportError as err:
-    pytest.exit("CuPy not available, skipping GPU tests...")
+    pytest.skip(allow_module_level=True, reason="CuPy not available, skipping GPU tests...")
 
 
 import pyop3 as op3
@@ -23,16 +23,16 @@ def mesh():
     return UnitSquareMesh(3, 3)
 
 @pytest.fixture()
-def FuncSpace(mesh):
+def space(mesh):
     return FunctionSpace(mesh, "P", 2)
 
 @pytest.fixture()
-def f(FuncSpace):
-    return Function(FuncSpace).assign(10)
+def f(space):
+    return Function(space).assign(10)
 
 @pytest.fixture()
-def g(FuncSpace):
-    return Function(FuncSpace)
+def g(space):
+    return Function(space)
 
 def state(func, device):
     """Shorthand for reading buffer state on a given device."""
@@ -56,14 +56,14 @@ def test_gpu_state_not_created(self, f):
 class TestOffloadingArrayTypes:
     """Inside op3.offloading, data array type should be GPU array types"""
 
-    def test_buffer_evaluates_cupy_on_cudagpu(self, FuncSpace):
-        f = Function(FuncSpace).assign(10)
+    def test_buffer_evaluates_cupy_on_cudagpu(self, space):
+        f = Function(space).assign(10)
         with op3.offloading(CUDAGPU):
             assert isinstance(f.dat.data_ro, cp.ndarray)
 
-    def test_buffer_creation_on_cudagpu(self, FuncSpace):
+    def test_buffer_creation_on_cudagpu(self, space):
         with op3.offloading(CUDAGPU):
-            k = Function(FuncSpace)
+            k = Function(space)
             assert isinstance(k.dat.data_ro, cp.ndarray)
 
 class TestOffloadingAssignmentState:
@@ -102,24 +102,25 @@ def test_gpu_data_wo_copied_to_host(self, g):
 
 class TestDeviceArrayDuplication:
     
-    def test_duplicate_not_same(self, FuncSpace):
+    def test_duplicate_not_same(self, space):
         """Duplicate buffer is not same object"""
         with op3.offloading(CUDAGPU):
-            k = Function(FuncSpace)
+            k = Function(space)
             k_dup_buffer = k.dat.buffer.duplicate()
             assert type(k_dup_buffer) == type(k.dat.buffer)
             assert not k_dup_buffer is k.dat.buffer
 
-    def test_duplicate_to_device(self, FuncSpace):
+    def test_duplicate_to_device(self, space):
         """ Buffer maintains device context when copied"""
         with op3.offloading(CUDAGPU):
-            k = Function(FuncSpace)
+            k = Function(space)
             k_dup_buffer = k.dat.buffer.duplicate()
             assert isinstance(k_dup_buffer.data_ro, cp.ndarray)
 
-    def test_duplicate_copy_to_device(self, FuncSpace):
+    def test_duplicate_copy_to_device(self, space):
         """ Buffer maintains device context when exact copy"""
         with op3.offloading(CUDAGPU):
-            k = Function(FuncSpace)
+            k = Function(space)
             k_dup_buffer = k.dat.buffer.duplicate(copy=True)
             assert isinstance(k_dup_buffer.data_ro, cp.ndarray)
+            assert k_dup_buffer._data is k.dat.buffer._data

From 891e2c7ff5b2c4fd2429de8a9a1515e6a58281ba Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Tue, 12 May 2026 16:27:48 +0100
Subject: [PATCH 34/36] removing gpu demo

---
 pyop3_gpu_demo.py | 81 -----------------------------------------------
 1 file changed, 81 deletions(-)
 delete mode 100644 pyop3_gpu_demo.py

diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
deleted file mode 100644
index 0c856e509f..0000000000
--- a/pyop3_gpu_demo.py
+++ /dev/null
@@ -1,81 +0,0 @@
-"""
-Useful links:
-
-  * https://github.com/firedrakeproject/firedrake/blob/main/.github/workflows/core.yml#L476
-
-    How to build a GPU-enabled Firedrake.
-
-  * https://github.com/firedrakeproject/firedrake/blob/connorjward/pyop3-gpu/pyop3/device.py
-
-    An implementation of the 'device' context manager. It needs a big refactor.
-
-  * https://github.com/OP2/PyOP2/pull/691/changes#diff-f8765d963b5adb1788f453e259d8cd45f29cee9670563ddb99b9fe2bba115a12
-
-    Using a wrapper type to track changes between host and device. In pyop3
-    this would be the 'ArrayBuffer' object and link into existing
-    state tracking.
-"""
-
-import numpy as np
-
-from firedrake import *
-import pyop3 as op3
-
-from pyop3.device import on_host
-
-
-# made up API, we need some way to identify the device
-host = op3.HOST_DEVICE  # or similar
-gpu = op3.CUDAGPU()
-
-mesh = UnitSquareMesh(3, 3)
-V = FunctionSpace(mesh, "P", 2)
-
-f = Function(V).assign(10)
-g = Function(V)
-
-assert isinstance(f.dat.data_ro, np.ndarray)
-assert isinstance(g.dat.data_ro, np.ndarray)
-
-# state tracking checks, .buffer.state is now device-specific
-assert f.dat.buffer.state[host] == 1  # modified once
-assert f.dat.buffer.state[gpu] == -1  # not created
-assert g.dat.buffer.state[host] == 0  # untouched
-assert g.dat.buffer.state[gpu] == -1  # not created
-
-with op3.offloading(gpu):
-    # Getting the .data attribute on the GPU should give us back a GPU array type
-    assert not isinstance(f.dat.data_ro, np.ndarray)
-    assert not isinstance(g.dat.data_ro, np.ndarray)
-
-    # # Do the assignment using array operations
-    g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="array")
-
-    # # Do the assignment using MLIR (this is a later step)
-    # # g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="compile")
-    k = Function(V)
-    k.dat.buffer.duplicate()
-    k.dat.buffer.duplicate(copy=True)
-
-    k.dat.data_rw[...] = 3 
-
-    # state tracking checks
-    assert f.dat.buffer.state[host] == 1  # modified once
-    assert f.dat.buffer.state[gpu] == 1  # matches host
-    assert g.dat.buffer.state[host] == 0  # untouched
-    assert g.dat.buffer.state[gpu] == 1  # modified once
-    assert k.dat.buffer.state[host] == 0 # not modified
-    assert k.dat.buffer.state[gpu] == 1  # modified 
-
-assert isinstance(f.dat.data_ro, np.ndarray)
-assert isinstance(g.dat.data_ro, np.ndarray)
-assert (g.dat.data_ro == 23).all()
-assert (k.dat.data_ro == 3).all()
-
-# state tracking checks
-assert f.dat.buffer.state[host] == 1  # modified once
-assert f.dat.buffer.state[gpu] == 1  # matches host
-assert g.dat.buffer.state[host] == 1  # matches device
-assert g.dat.buffer.state[gpu] == 1  # modified once
-assert k.dat.buffer.state[host] == 1  # matches device 
-assert k.dat.buffer.state[gpu] == 1  # modified once 

From 5bd5243a8f1fb05a4015eb40d57bfb8429216a7a Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Tue, 12 May 2026 16:35:35 +0100
Subject: [PATCH 35/36] more descriptive docstring

---
 pyop3/device.py   | 18 ++++++++---
 pyop3_gpu_demo.py | 81 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 4 deletions(-)
 create mode 100644 pyop3_gpu_demo.py

diff --git a/pyop3/device.py b/pyop3/device.py
index c0bf3703af..1308b31ede 100644
--- a/pyop3/device.py
+++ b/pyop3/device.py
@@ -30,7 +30,7 @@ def __str__(self):
 
 class CPU(Device):
     """
-    CPU Class, designed to be host object.
+    CPU Class, designed to be host object, inheriting Device
     - Plausible to have multiple CPUs, functionally similar to having GPU
     """
     name = "CPU"
@@ -59,7 +59,7 @@ def zeros_like(self, arr):
 
 class CUDAGPU(Device):
     """ 
-    GPU class for Nvidia GPUs 
+    GPU class for Nvidia GPUs. inheriting Device.
     - All offloading will be done through CuPy
     - Multiple instantiations will be independent of each other 
     """
@@ -96,8 +96,18 @@ def zeros_like(self, arr):
 def offloading(dev: Device):
     """ 
     Context Manager for offloading components to select device
-    - Controls global _current_device variable
-    - Stores former device in `token` to allow stacking of context changes
+    This function should be the only way to modfiy the current device variable
+
+    Updates current context to the given `dev` variable.
+    Former device is stored in stack, to be restored when finished
+        - This also allows for stacking of context windows
+
+    Context variables are also async safe.
+
+    --- Example:
+    gpu = op3.CUDAGPU()
+    with op3.offloading(gpu):
+        g.dat.assign(23, eager=True, eager_strategy="array")
     """
 
     # TODO: Not Device exception
diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py
new file mode 100644
index 0000000000..0c856e509f
--- /dev/null
+++ b/pyop3_gpu_demo.py
@@ -0,0 +1,81 @@
+"""
+Useful links:
+
+  * https://github.com/firedrakeproject/firedrake/blob/main/.github/workflows/core.yml#L476
+
+    How to build a GPU-enabled Firedrake.
+
+  * https://github.com/firedrakeproject/firedrake/blob/connorjward/pyop3-gpu/pyop3/device.py
+
+    An implementation of the 'device' context manager. It needs a big refactor.
+
+  * https://github.com/OP2/PyOP2/pull/691/changes#diff-f8765d963b5adb1788f453e259d8cd45f29cee9670563ddb99b9fe2bba115a12
+
+    Using a wrapper type to track changes between host and device. In pyop3
+    this would be the 'ArrayBuffer' object and link into existing
+    state tracking.
+"""
+
+import numpy as np
+
+from firedrake import *
+import pyop3 as op3
+
+from pyop3.device import on_host
+
+
+# made up API, we need some way to identify the device
+host = op3.HOST_DEVICE  # or similar
+gpu = op3.CUDAGPU()
+
+mesh = UnitSquareMesh(3, 3)
+V = FunctionSpace(mesh, "P", 2)
+
+f = Function(V).assign(10)
+g = Function(V)
+
+assert isinstance(f.dat.data_ro, np.ndarray)
+assert isinstance(g.dat.data_ro, np.ndarray)
+
+# state tracking checks, .buffer.state is now device-specific
+assert f.dat.buffer.state[host] == 1  # modified once
+assert f.dat.buffer.state[gpu] == -1  # not created
+assert g.dat.buffer.state[host] == 0  # untouched
+assert g.dat.buffer.state[gpu] == -1  # not created
+
+with op3.offloading(gpu):
+    # Getting the .data attribute on the GPU should give us back a GPU array type
+    assert not isinstance(f.dat.data_ro, np.ndarray)
+    assert not isinstance(g.dat.data_ro, np.ndarray)
+
+    # # Do the assignment using array operations
+    g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="array")
+
+    # # Do the assignment using MLIR (this is a later step)
+    # # g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="compile")
+    k = Function(V)
+    k.dat.buffer.duplicate()
+    k.dat.buffer.duplicate(copy=True)
+
+    k.dat.data_rw[...] = 3 
+
+    # state tracking checks
+    assert f.dat.buffer.state[host] == 1  # modified once
+    assert f.dat.buffer.state[gpu] == 1  # matches host
+    assert g.dat.buffer.state[host] == 0  # untouched
+    assert g.dat.buffer.state[gpu] == 1  # modified once
+    assert k.dat.buffer.state[host] == 0 # not modified
+    assert k.dat.buffer.state[gpu] == 1  # modified 
+
+assert isinstance(f.dat.data_ro, np.ndarray)
+assert isinstance(g.dat.data_ro, np.ndarray)
+assert (g.dat.data_ro == 23).all()
+assert (k.dat.data_ro == 3).all()
+
+# state tracking checks
+assert f.dat.buffer.state[host] == 1  # modified once
+assert f.dat.buffer.state[gpu] == 1  # matches host
+assert g.dat.buffer.state[host] == 1  # matches device
+assert g.dat.buffer.state[gpu] == 1  # modified once
+assert k.dat.buffer.state[host] == 1  # matches device 
+assert k.dat.buffer.state[gpu] == 1  # modified once 

From 74eb11fcc0fdaf138dbf771acd8d6f2872f83f93 Mon Sep 17 00:00:00 2001
From: SamSJackson <s2020962@ed.ac.uk>
Date: Wed, 13 May 2026 10:58:58 +0100
Subject: [PATCH 36/36] pr: changes to remove _data for get_array

---
 pyop3/buffer.py                         | 54 ++++++++++++-------------
 pyop3/expr/visitors/__init__.py         |  2 +-
 pyop3/insn/exec.py                      |  5 ++-
 pyop3/tree/axis_tree/visitors/layout.py |  2 +-
 pyop3/tree/axis_tree/visitors/size.py   |  2 +-
 tests/pyop3/unit/test_gpu_context.py    |  2 +-
 6 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/pyop3/buffer.py b/pyop3/buffer.py
index d53833d4f3..37c2996dae 100644
--- a/pyop3/buffer.py
+++ b/pyop3/buffer.py
@@ -57,17 +57,6 @@ def wrapper(self, *args, **kwargs):
 
     return wrapper
 
-
-def record_modified(func):
-    def wrapper(self, *args, **kwargs):
-        assert not self.constant
-        try:
-            return func(self, *args, **kwargs)
-        finally:
-            self.inc_state()
-    return wrapper
-
-
 class AbstractBuffer(DistributedObject, metaclass=abc.ABCMeta):
 
     DEFAULT_PREFIX = "buffer"
@@ -285,13 +274,14 @@ def __init__(
         self.__post_init__()
 
     def __post_init__(self) -> None:
+        curr_dev = get_current_device()
         assert self.sf.size == self.size
         if self.rank_equal:
             assert self.constant
         if self.ordered:
             utils.debug_assert(lambda: utils.is_sorted(self._lazy_data))
-        if self.constant and isinstance(self._data, np.ndarray):
-            assert not self._data.flags.writeable
+        if self.constant and isinstance(self._lazy_data[curr_dev], np.ndarray):
+            assert not self._lazy_data[curr_dev].flags.writeable
 
     # }}}
 
@@ -312,11 +302,11 @@ def __post_init__(self) -> None:
 
     @property
     def size(self) -> int:
-        return self._data.size
+        return self.get_array().size
 
     @property
     def dtype(self) -> np.dtype:
-        return self._data.dtype
+        return self.get_array().dtype
 
     @property
     def _last_updated_device(self) -> Device:
@@ -347,7 +337,7 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer:
     
     @property
     def handle(self) -> np.ndarray | cp.ndarray:
-        return self._data
+        return self.get_array()
 
     @property
     def comm(self) -> MPI.Comm:
@@ -441,7 +431,6 @@ def data(self):
         return self.data_rw
 
     @property
-    @record_modified
     @not_in_flight
     def data_rw(self):
         if not self._roots_valid:
@@ -451,7 +440,7 @@ def data_rw(self):
 
         # modifying owned values invalidates ghosts
         self._leaves_valid = False
-        return self._data
+        return self.get_array("rw")
 
     # TODO: It would be good to be able to get data_ro but without updating the halos
     # The issue with the previous approach is we would only return the owned data. This
@@ -464,10 +453,9 @@ def data_ro(self):
             self.reduce_leaves_to_roots()
         if not self._leaves_valid:
             self.broadcast_roots_to_leaves()
-        return readonly(self._data)
+        return readonly(self.get_array("ro"))
 
     @property
-    @record_modified
     @not_in_flight
     def data_wo(self):
         """
@@ -480,7 +468,7 @@ def data_wo(self):
         # pending writes can be dropped
         self._pending_reduction = None
         self._leaves_valid = False
-        return self._data
+        return self.get_array("wo")
 
     @not_in_flight
     def assemble(self) -> None:
@@ -490,13 +478,15 @@ def assemble(self) -> None:
     def leaves_valid(self) -> bool:
         return self._leaves_valid
 
-    @property
-    def _data(self):
+    def get_array(self, intent: Literal["ro", "rw", "wo"] = "ro"):
         curr_dev = get_current_device() 
 
         if not self._is_data_available_and_synced(curr_dev):
             self.sync_devices()
         
+        if intent in {"wo", "rw"}: 
+            self.inc_state() 
+            
         return self._lazy_data[curr_dev]
 
     # TODO: I think the halo bits should only be handled at the Dat level via the
@@ -534,14 +524,16 @@ def reduce_leaves_to_roots(self):
 
     @not_in_flight
     def reduce_leaves_to_roots_begin(self):
+        curr_dev = get_current_device()
         if not self._roots_valid:
             self.sf.reduce_begin(
-                self._data, self._reduction_ops[self._pending_reduction]
+                self._lazy_data[curr_dev], self._reduction_ops[self._pending_reduction]
             )
             self._leaves_valid = False
         self._finalizer = self.reduce_leaves_to_roots_end
 
     def reduce_leaves_to_roots_end(self):
+        curr_dev = get_current_device()
         if self._finalizer is None:
             raise BadOrderingException(
                 "Should not call _reduce_leaves_to_roots_end without first calling "
@@ -551,7 +543,7 @@ def reduce_leaves_to_roots_end(self):
             raise DataTransferInFlightException("Wrong finalizer called")
 
         if not self._roots_valid:
-            self.sf.reduce_end(self._data, self._reduction_ops[self._pending_reduction])
+            self.sf.reduce_end(self._lazy_data[curr_dev], self._reduction_ops[self._pending_reduction])
         self._pending_reduction = None
         self._finalizer = None
 
@@ -563,14 +555,16 @@ def broadcast_roots_to_leaves(self):
 
     @not_in_flight
     def broadcast_roots_to_leaves_begin(self):
+        curr_dev = get_current_device()
         if not self._roots_valid:
             raise RuntimeError("Cannot broadcast invalid roots")
 
         if not self._leaves_valid:
-            self.sf.broadcast_begin(self._data, MPI.REPLACE)
+            self.sf.broadcast_begin(self._lazy_data[curr_dev], MPI.REPLACE)
         object.__setattr__(self, "_finalizer", self.broadcast_roots_to_leaves_end)
 
     def broadcast_roots_to_leaves_end(self):
+        curr_dev = get_current_device()
         if self._finalizer is None:
             raise BadOrderingException(
                 "Should not call _broadcast_roots_to_leaves_end without first "
@@ -580,7 +574,7 @@ def broadcast_roots_to_leaves_end(self):
             raise DataTransferInFlightException("Wrong finalizer called")
 
         if not self._leaves_valid:
-            self.sf.broadcast_end(self._data, MPI.REPLACE)
+            self.sf.broadcast_end(self._lazy_data[curr_dev], MPI.REPLACE)
         self._leaves_valid = True
         self._finalizer = None
 
@@ -617,7 +611,11 @@ def sync_devices(self):
         last_updated_device = self._last_updated_device
         current_device = get_current_device()
 
-        self._lazy_data[current_device] = current_device.asarray(self._lazy_data[last_updated_device], constant=self.constant)
+        self._lazy_data[current_device] = current_device.asarray(
+            self._lazy_data[last_updated_device], 
+            constant=self.constant
+        )
+        
         self._state[current_device] = self._state[last_updated_device]
 
     def _is_data_available_and_synced(self, device: Device) -> bool:
diff --git a/pyop3/expr/visitors/__init__.py b/pyop3/expr/visitors/__init__.py
index 38d528eb1e..b61ac0a098 100644
--- a/pyop3/expr/visitors/__init__.py
+++ b/pyop3/expr/visitors/__init__.py
@@ -1112,7 +1112,7 @@ def get_extremum(expr, extremum: Literal["max", "min"]) -> numbers.Number:
         result.assign(fn(result, expr)),
         eager=True
     )
-    return utils.just_one(result.buffer._data)
+    return utils.just_one(result.buffer.get_array())
 
 
 def max_(a, b, /, *, lazy: bool = False) -> pyop3.expr.Conditional | numbers.Number:
diff --git a/pyop3/insn/exec.py b/pyop3/insn/exec.py
index 5514f7b158..77533121bb 100644
--- a/pyop3/insn/exec.py
+++ b/pyop3/insn/exec.py
@@ -473,7 +473,7 @@ def _buffer_str(self, buffer):
 
     @_buffer_str.register
     def _(self, buffer: pyop3.buffer.ArrayBuffer):
-        return f"({buffer.size})", str(buffer._data)
+        return f"({buffer.size})", str(buffer.get_array())
 
     @_buffer_str.register
     def _(self, buffer: pyop3.buffer.PetscMatBuffer) -> str:
@@ -610,7 +610,8 @@ def _buffer_exchanges(self, buffer, intent):
                     nil = dtype_limits(buffer.dtype).min
 
                 def _init_nil():
-                    buffer._data[buffer.sf.ileaf] = nil
+                    # Not modifying owned values so don't want to update state via intent
+                    buffer.get_array()[buffer.sf.ileaf] = nil
 
                 reductions.append(_init_nil)
 
diff --git a/pyop3/tree/axis_tree/visitors/layout.py b/pyop3/tree/axis_tree/visitors/layout.py
index c9525b5eb0..4383bd59ce 100644
--- a/pyop3/tree/axis_tree/visitors/layout.py
+++ b/pyop3/tree/axis_tree/visitors/layout.py
@@ -234,7 +234,7 @@ def _compute_layouts(axis_tree: AxisTree) -> idict[ConcretePathT, ExpressionT]:
     # Lastly 'freeze' the offset dats so they can no longer be modified
     for _, offset_dat in to_tabulate:
         object.__setattr__(offset_dat.buffer, "_constant", True)
-        offset_dat.buffer._data.flags.writeable = False
+        offset_dat.buffer.get_array().flags.writeable = False
 
     return layouts
 
diff --git a/pyop3/tree/axis_tree/visitors/size.py b/pyop3/tree/axis_tree/visitors/size.py
index 68dfa34700..70742ec773 100644
--- a/pyop3/tree/axis_tree/visitors/size.py
+++ b/pyop3/tree/axis_tree/visitors/size.py
@@ -137,7 +137,7 @@ def compute_axis_tree_component_size(axis_tree: AbstractAxisTree, path: PathT, c
         if component_size_axes is UNIT_AXIS_TREE:
             # ick way to make sure that if we have sizes wrapped up into Scalars that this
             # gets passed up
-            mysize = utils.just_one(component_size.buffer._data)
+            mysize = utils.just_one(component_size.buffer.get_array())
             if not isinstance(subtree_size, numbers.Integral):
                 sbuf = ArrayBuffer.from_scalar(mysize, constant=True)
                 mysize = ScalarBufferExpression(sbuf)
diff --git a/tests/pyop3/unit/test_gpu_context.py b/tests/pyop3/unit/test_gpu_context.py
index 51306abad1..472185d470 100644
--- a/tests/pyop3/unit/test_gpu_context.py
+++ b/tests/pyop3/unit/test_gpu_context.py
@@ -123,4 +123,4 @@ def test_duplicate_copy_to_device(self, space):
             k = Function(space)
             k_dup_buffer = k.dat.buffer.duplicate(copy=True)
             assert isinstance(k_dup_buffer.data_ro, cp.ndarray)
-            assert k_dup_buffer._data is k.dat.buffer._data
+            assert k_dup_buffer.get_array() is k.dat.buffer.get_array()