From 8ceb83038965fef4399718ea0b4abc3b21e84261 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Mon, 27 Apr 2026 12:39:40 +0100 Subject: [PATCH 01/36] introduce device.py and set up branch --- pyop3/device.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 pyop3/device.py diff --git a/pyop3/device.py b/pyop3/device.py new file mode 100644 index 0000000000..45048e3968 --- /dev/null +++ b/pyop3/device.py @@ -0,0 +1 @@ +# File to handle op3.device context manager From ba6de9b3cef1ad8f017664d3a651c04131f83632 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Tue, 28 Apr 2026 09:19:49 +0100 Subject: [PATCH 02/36] const parameter for host device --- pyop3/__init__.py | 4 ++++ pyop3/device.py | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/pyop3/__init__.py b/pyop3/__init__.py index 1cfc849fba..45ac9ebef1 100644 --- a/pyop3/__init__.py +++ b/pyop3/__init__.py @@ -87,6 +87,10 @@ def _init_likwid(): exscan, AssignmentType, ) +from pyop3.device import ( # noqa: F401 + HOST_DEVICE, + Device +) from pyop3.sf import StarForest, single_star_sf, local_sf import pyop3.sf from pyop3.tree.index_tree.parse import as_index_forest diff --git a/pyop3/device.py b/pyop3/device.py index 45048e3968..69ad5d7b46 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -1 +1,11 @@ # File to handle op3.device context manager +import abc +import contextlib +import warnings + +# TODO: Consider if there is approach to actually identify host method calling the python script +# For now, we just have a constant argument. +HOST_DEVICE = "cpu" + +class Device(abc.ABC): + pass From 6a9fd213cd8d84aeb65a8f45b6c749570ebaa4be Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Tue, 28 Apr 2026 12:16:28 +0100 Subject: [PATCH 03/36] include gpu demo and update petsc version as 3.24.5 misaligned for PCPatch*ExteriorFacets --- pyop3_gpu_demo.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 pyop3_gpu_demo.py diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py new file mode 100644 index 0000000000..9a67fdfa22 --- /dev/null +++ b/pyop3_gpu_demo.py @@ -0,0 +1,69 @@ +""" +Useful links: + + * https://github.com/firedrakeproject/firedrake/blob/main/.github/workflows/core.yml#L476 + + How to build a GPU-enabled Firedrake. + + * https://github.com/firedrakeproject/firedrake/blob/connorjward/pyop3-gpu/pyop3/device.py + + An implementation of the 'device' context manager. It needs a big refactor. + + * https://github.com/OP2/PyOP2/pull/691/changes#diff-f8765d963b5adb1788f453e259d8cd45f29cee9670563ddb99b9fe2bba115a12 + + Using a wrapper type to track changes between host and device. In pyop3 + this would be the 'ArrayBuffer' object and link into existing + state tracking. +""" + +import numpy as np + +from firedrake import * +import pyop3 as op3 + + +# made up API, we need some way to identify the device +host = op3.HOST_DEVICE # or similar +gpu = op3.Device() + +mesh = UnitSquareMesh(3, 3) +V = FunctionSpace(mesh, "P", 2) + +f = Function(V).assign(10) +g = Function(V) + +assert isinstance(f.dat.data_ro, np.ndarray) +assert isinstance(g.dat.data_ro, np.ndarray) + +# state tracking checks, .buffer.state is now device-specific +assert f.dat.buffer.state[host] == 1 # modified once +assert f.dat.buffer.state[gpu] == 0 # untouched +assert g.dat.buffer.state[host] == 0 # untouched +assert g.dat.buffer.state[gpu] == 0 # untouched + +with op3.device(gpu): + # Getting the .data attribute on the GPU should give us back a GPU array type + assert not isinstance(f.dat.data_ro, np.ndarray) + assert not isinstance(g.dat.data_ro, np.ndarray) + + # Do the assignment using array operations + g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="array") + + # Do the assignment using MLIR (this is a later step) + # g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="compile") + + # state tracking checks + assert f.dat.buffer.state[host] == 1 # modified once + assert f.dat.buffer.state[gpu] == 1 # matches host + assert g.dat.buffer.state[host] == 0 # untouched + assert g.dat.buffer.state[gpu] == 1 # modified once + +assert isinstance(f.dat.data_ro, np.ndarray) +assert isinstance(g.dat.data_ro, np.ndarray) +assert (g.dat.data_ro == 23).all() + +# state tracking checks +assert f.dat.buffer.state[host] == 1 # modified once +assert f.dat.buffer.state[gpu] == 1 # matches host +assert g.dat.buffer.state[host] == 1 # matches device +assert g.dat.buffer.state[gpu] == 1 # modified once diff --git a/pyproject.toml b/pyproject.toml index 3b2a1003e4..1e9153c5d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "numpy", "packaging", # TODO RELEASE - # "petsc4py==3.24.5", + # "petsc4py==3.25.0", # UNDO ME "petsctools @ git+https://github.com/firedrakeproject/petsctools.git@connorjward/cpetsc", "pkgconfig", From fb3d0b7885f09ecc118a70f8cd854d952bd31036 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Tue, 28 Apr 2026 13:57:52 +0100 Subject: [PATCH 04/36] noting areas for change --- pyop3/buffer.py | 8 +++++++- pyop3/device.py | 26 +++++++++++++++++++++----- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index e03d964f32..c1f0fb3160 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -226,16 +226,21 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer): # {{{ Instance attrs + # TODO: Update to lazily allocated pair of host/device arrays _lazy_data: np.ndarray = dataclasses.field(repr=False) sf: StarForest _name: str _constant: bool _rank_equal: bool _ordered: bool + # TODO: Device awareness variable - subscriber/observer? + _awareness: int _max_value: np.number | None = None - _state: int = 0 + # TODO: Mutable pair of variables corresponding to state for host & device + # Not done as dataclass requires immutable or default factory but latter cannot be attr + _state: int = 0 # flags for tracking parallel correctness _leaves_valid: bool = True @@ -464,6 +469,7 @@ def assemble(self) -> None: def leaves_valid(self) -> bool: return self._leaves_valid + # TODO: Update this to provide CPU/GPU array as per awareness @property def _data(self): if self._lazy_data is None: diff --git a/pyop3/device.py b/pyop3/device.py index 69ad5d7b46..363e2e7bf3 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -1,11 +1,27 @@ # File to handle op3.device context manager -import abc +from abc import ABCMeta, abstractmethod import contextlib import warnings -# TODO: Consider if there is approach to actually identify host method calling the python script -# For now, we just have a constant argument. -HOST_DEVICE = "cpu" +# TODO: The constant should be elsewhere but temporary here +HOST_DEVICE = 0 -class Device(abc.ABC): +class Device(metaclass=ABCMeta): + _available: bool + _name: str + + def __init__(self): + pass + +class GPU(Device): pass + +@contextlib.contextmanager +def device(dev: Device): + # TODO: Update buffers and copy to + try: + yield "not implemented" + finally: + # TODO: Update buffers and copy back + pass + From e39f2150e5d8fa0af8677f707f8c94e6e824b0bc Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Tue, 28 Apr 2026 16:41:33 +0100 Subject: [PATCH 05/36] introducing context variable and lazy cupy --- pyop3/__init__.py | 3 ++- pyop3/buffer.py | 26 ++++++++++++++++++++------ pyop3/device.py | 24 +++++++++++++++++++++--- pyop3_gpu_demo.py | 13 +++++++------ 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/pyop3/__init__.py b/pyop3/__init__.py index 45ac9ebef1..e5590ac2d1 100644 --- a/pyop3/__init__.py +++ b/pyop3/__init__.py @@ -89,7 +89,8 @@ def _init_likwid(): ) from pyop3.device import ( # noqa: F401 HOST_DEVICE, - Device + GPU, + offloading ) from pyop3.sf import StarForest, single_star_sf, local_sf import pyop3.sf diff --git a/pyop3/buffer.py b/pyop3/buffer.py index c1f0fb3160..4197e4ea81 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -11,6 +11,7 @@ from typing import Any, ClassVar, Hashable import numpy as np +import cupy as cp from mpi4py import MPI from petsc4py import PETSc @@ -20,6 +21,7 @@ from pyop3.dtypes import IntType, ScalarType, DTypeT from pyop3.sf import DistributedObject, NullStarForest, StarForest, local_sf from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly +from pyop3.device import _current_device from ._buffer_cy import set_petsc_mat_diagonal @@ -227,7 +229,7 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer): # {{{ Instance attrs # TODO: Update to lazily allocated pair of host/device arrays - _lazy_data: np.ndarray = dataclasses.field(repr=False) + _lazy_data: np.ndarray | cp.ndarray = dataclasses.field(repr=False) sf: StarForest _name: str _constant: bool @@ -472,11 +474,23 @@ def leaves_valid(self) -> bool: # TODO: Update this to provide CPU/GPU array as per awareness @property def _data(self): - if self._lazy_data is None: - self._lazy_data = np.zeros(self.shape, dtype=self.dtype) - if self.name == "array_247_buffer": - breakpoint() - return self._lazy_data + # NOTE: Testing context manager + _location = _current_device.get() + + if _location == "cpu": + if self._lazy_data is None: + self._lazy_data = np.zeros(self.shape, dtype=self.dtype) + if self.name == "array_247_buffer": + breakpoint() + return self._lazy_data + elif _location == "gpu": + if self._lazy_data is None: + self._lazy_data = cp.zeros(self.shape, dtype=self.dtype) + if self.name == "array_247_buffer": + breakpoint() + return cp.asarray(self._lazy_data) + else: + raise RuntimeError("Offload device not supported") # TODO: I think the halo bits should only be handled at the Dat level via the # axis tree. Here we can just consider the array. diff --git a/pyop3/device.py b/pyop3/device.py index 363e2e7bf3..73429073e1 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -1,27 +1,45 @@ # File to handle op3.device context manager from abc import ABCMeta, abstractmethod import contextlib +import contextvars import warnings # TODO: The constant should be elsewhere but temporary here -HOST_DEVICE = 0 +HOST_DEVICE = "cpu" + +# NOTE: Use contextvars to act as a bridge between buffer and manager classes +_current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE) class Device(metaclass=ABCMeta): _available: bool _name: str def __init__(self): + self._available = True pass + @abstractmethod + def __repr__(self): + pass + class GPU(Device): + + def __init__(self): + super().__init__() + self._name = "gpu" pass + def __repr__(self): + return self._name + @contextlib.contextmanager -def device(dev: Device): +def offloading(dev: Device): # TODO: Update buffers and copy to + token = _current_device.set(dev) try: - yield "not implemented" + yield finally: # TODO: Update buffers and copy back + _current_device.reset(token) pass diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py index 9a67fdfa22..6d7ad4bb03 100644 --- a/pyop3_gpu_demo.py +++ b/pyop3_gpu_demo.py @@ -24,7 +24,7 @@ # made up API, we need some way to identify the device host = op3.HOST_DEVICE # or similar -gpu = op3.Device() +gpu = op3.GPU() mesh = UnitSquareMesh(3, 3) V = FunctionSpace(mesh, "P", 2) @@ -36,13 +36,14 @@ assert isinstance(g.dat.data_ro, np.ndarray) # state tracking checks, .buffer.state is now device-specific -assert f.dat.buffer.state[host] == 1 # modified once -assert f.dat.buffer.state[gpu] == 0 # untouched -assert g.dat.buffer.state[host] == 0 # untouched -assert g.dat.buffer.state[gpu] == 0 # untouched +# assert f.dat.buffer.state[host] == 1 # modified once +# assert f.dat.buffer.state[gpu] == 0 # untouched +# assert g.dat.buffer.state[host] == 0 # untouched +# assert g.dat.buffer.state[gpu] == 0 # untouched -with op3.device(gpu): +with op3.offloading(gpu): # Getting the .data attribute on the GPU should give us back a GPU array type + assert not isinstance(f.dat.data_ro, np.ndarray) assert not isinstance(g.dat.data_ro, np.ndarray) From 2a54e848fc4e5aa19bf0cea255b7451720558bda Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Wed, 29 Apr 2026 12:23:20 +0100 Subject: [PATCH 06/36] lazy evaluation of arrays --- pyop3/buffer.py | 75 +++++++++++++++++++++++++++------------------ pyop3/device.py | 57 +++++++++++++++++++--------------- pyop3/exceptions.py | 3 ++ pyop3/utils.py | 15 +++++++-- pyop3_gpu_demo.py | 8 ++--- 5 files changed, 98 insertions(+), 60 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 4197e4ea81..d9be4a46b9 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -21,7 +21,10 @@ from pyop3.dtypes import IntType, ScalarType, DTypeT from pyop3.sf import DistributedObject, NullStarForest, StarForest, local_sf from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly -from pyop3.device import _current_device +from pyop3.device import ( + _current_device, + HOST_DEVICE +) from ._buffer_cy import set_petsc_mat_diagonal @@ -224,25 +227,22 @@ def handle(self, *, nest_indices: tuple[tuple[int, ...], ...] = ()) -> Any: # copies should live in this class. @pyop3.record.record() class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer): - """A buffer whose underlying data structure is a numpy array.""" + """A buffer whose underlying data structure is a lazily-evaluated NumPy/CuPy array.""" # {{{ Instance attrs # TODO: Update to lazily allocated pair of host/device arrays - _lazy_data: np.ndarray | cp.ndarray = dataclasses.field(repr=False) + _lazy_data: dict[str, np.ndarray | cp.ndarray] = dataclasses.field(repr=False) sf: StarForest _name: str _constant: bool _rank_equal: bool _ordered: bool - # TODO: Device awareness variable - subscriber/observer? - _awareness: int + # TODO: Mutable pair of variables corresponding to state for host & device + _state: dict[str, int] _max_value: np.number | None = None - # TODO: Mutable pair of variables corresponding to state for host & device - # Not done as dataclass requires immutable or default factory but latter cannot be attr - _state: int = 0 # flags for tracking parallel correctness _leaves_valid: bool = True @@ -254,8 +254,10 @@ def instruction_executor_cache_key(self, buffer_counter: Mapping[AbstractBuffer, type(self), self._constant, self._rank_equal, self._ordered, self.dtype, buffer_counter[self]) - def __init__(self, data: np.ndarray, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False): - data = data.flatten() + def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False): + + data_mapping = {pyop3.HOST_DEVICE: None, "gpu": None} + if sf is None: sf = NullStarForest(data.size) name = utils.maybe_generate_name(name, prefix, self.DEFAULT_PREFIX) @@ -265,16 +267,24 @@ def __init__(self, data: np.ndarray, sf: StarForest | None = None, *, name: str| if rank_equal and not constant: raise ValueError + # TODO: CuPy has no support for `writeable` flag if constant: data.flags.writeable = False - self._lazy_data = data + if isinstance(data, np.ndarray): + data_mapping[pyop3.HOST_DEVICE] = data + elif isinstance(data, cp.ndarray): + data_mapping["gpu"] = data + + self._lazy_data = data_mapping self.sf = sf self._name = name self._constant = constant self._rank_equal = rank_equal self._max_value = max_value self._ordered = ordered + # TODO: Quite ugly, maybe switch to ENUM in future. + self._state = {pyop3.HOST_DEVICE: 0, "gpu": 0} self.__post_init__() def __post_init__(self) -> None: @@ -312,8 +322,10 @@ def dtype(self) -> np.dtype: return self._data.dtype def inc_state(self) -> None: - self._state += 1 + self._state[self.context] += 1 + # NOTE: Why is this using _lazy_data instead of _data? + # TODO: Either adjust for _lazy_data mapping or switch to _data def duplicate(self, *, copy: bool = False) -> ArrayBuffer: # make sure that there are no pending transfers before we copy self.assemble() @@ -323,8 +335,16 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer: else: data = np.zeros_like(self._lazy_data) return self.__record_init__(_name=name, _lazy_data=data) + + def sync_to_host(self): + self.state[pyop3.HOST_DEVICE] = cp.asnumpy(self.state["gpu"]) + self.state[pyop3.HOST_DEVICE] = self.state["gpu"] is_nested: ClassVar[bool] = False + + @property + def context(self) -> str: + return str(_current_device.get()) @property def handle(self) -> np.ndarray: @@ -471,26 +491,23 @@ def assemble(self) -> None: def leaves_valid(self) -> bool: return self._leaves_valid - # TODO: Update this to provide CPU/GPU array as per awareness + # TODO: Need logic to recognise when to copy data to GPU @property def _data(self): - # NOTE: Testing context manager - _location = _current_device.get() - - if _location == "cpu": - if self._lazy_data is None: - self._lazy_data = np.zeros(self.shape, dtype=self.dtype) - if self.name == "array_247_buffer": - breakpoint() - return self._lazy_data - elif _location == "gpu": - if self._lazy_data is None: - self._lazy_data = cp.zeros(self.shape, dtype=self.dtype) - if self.name == "array_247_buffer": - breakpoint() - return cp.asarray(self._lazy_data) + context = self.context + + if context == "cpu": + if self._lazy_data[context] is None: + self._lazy_data[context] = np.zeros(self.shape, dtype=self.dtype) + elif context == "gpu": + if self._lazy_data[context] is None: + self._lazy_data[context] = cp.zeros(self.shape, dtype=self.dtype) else: - raise RuntimeError("Offload device not supported") + raise RuntimeError("Offload device not supported") + + if self.name == "array_247_buffer": + breakpoint() + return self._lazy_data[context] # TODO: I think the halo bits should only be handled at the Dat level via the # axis tree. Here we can just consider the array. diff --git a/pyop3/device.py b/pyop3/device.py index 73429073e1..2bbc0c0c83 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -5,41 +5,48 @@ import warnings # TODO: The constant should be elsewhere but temporary here -HOST_DEVICE = "cpu" +HOST_DEVICE = "cpu" # NOTE: Use contextvars to act as a bridge between buffer and manager classes _current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE) class Device(metaclass=ABCMeta): - _available: bool - _name: str + _available: bool + _name: str + _accessed_buffer: list - def __init__(self): - self._available = True - pass + def __init__(self): + self._available = True + self._accessed_buffer = [] + pass + + @abstractmethod + def __repr__(self): + pass - @abstractmethod - def __repr__(self): - pass + @abstractmethod + def __str__(self): + pass class GPU(Device): - - def __init__(self): - super().__init__() - self._name = "gpu" - pass + + def __init__(self): + super().__init__() + self._name = "gpu" + pass - def __repr__(self): - return self._name + def __repr__(self): + return self._name + + def __str__(self): + return self._name @contextlib.contextmanager def offloading(dev: Device): - # TODO: Update buffers and copy to - token = _current_device.set(dev) - try: - yield - finally: - # TODO: Update buffers and copy back - _current_device.reset(token) - pass - + token = _current_device.set(dev) + try: + yield + finally: + _current_device.reset(token) + # TODO: Update buffers and copy back + # NOTE: Does this have to happen lazily if we do not have a record of accessed buffers? diff --git a/pyop3/exceptions.py b/pyop3/exceptions.py index 22fb6f8bf7..4a8cc5f1dc 100644 --- a/pyop3/exceptions.py +++ b/pyop3/exceptions.py @@ -24,6 +24,9 @@ class ValueMismatchException(Pyop3Exception): class UnhashableObjectException(Pyop3Exception, TypeError): pass +class UnsupportedArrayException(Pyop3Exception, TypeError): + pass + # {{{ caching diff --git a/pyop3/utils.py b/pyop3/utils.py index 3fbe8778ad..b9f34d4e10 100644 --- a/pyop3/utils.py +++ b/pyop3/utils.py @@ -14,6 +14,7 @@ import cachetools import numpy as np +import cupy as cp import pytools from immutabledict import immutabledict from mpi4py import MPI @@ -23,7 +24,7 @@ from pyop3.config import config from pyop3.constants import PYOP3_DECIDE, _nothing from pyop3.dtypes import DTypeT, IntType -from pyop3.exceptions import CommMismatchException, CommNotFoundException, Pyop3Exception, UnhashableObjectException +from pyop3.exceptions import CommMismatchException, CommNotFoundException, Pyop3Exception, UnhashableObjectException, UnsupportedArrayException from pyop3.mpi import collective @@ -331,12 +332,22 @@ def map_when(func, when_func, iterable): yield item -def readonly(array): +@functools.singledispatch +def readonly(array: Any) -> Any: + raise UnsupportedArrayException + +@readonly.register +def _(array: np.ndarray) -> np.ndarray: """Return a readonly view of a numpy array.""" view = array.view() view.setflags(write=False) return view +@readonly.register +def _(array: cp.ndarray) -> cp.ndarray: + """ Return a view of a CuPy array.""" + view = array.view() + return view def debug_assert(predicate, msg=None): if config.debug_checks: diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py index 6d7ad4bb03..6e735e7126 100644 --- a/pyop3_gpu_demo.py +++ b/pyop3_gpu_demo.py @@ -36,10 +36,10 @@ assert isinstance(g.dat.data_ro, np.ndarray) # state tracking checks, .buffer.state is now device-specific -# assert f.dat.buffer.state[host] == 1 # modified once -# assert f.dat.buffer.state[gpu] == 0 # untouched -# assert g.dat.buffer.state[host] == 0 # untouched -# assert g.dat.buffer.state[gpu] == 0 # untouched +assert f.dat.buffer.state[host] == 1 # modified once +assert f.dat.buffer.state[str(gpu)] == 0 # untouched +assert g.dat.buffer.state[host] == 0 # untouched +assert g.dat.buffer.state[str(gpu)] == 0 # untouched with op3.offloading(gpu): # Getting the .data attribute on the GPU should give us back a GPU array type From 98d60103386c3b742cbdfc0688906081714c0dcc Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Wed, 29 Apr 2026 13:47:37 +0100 Subject: [PATCH 07/36] passes basic script functionality --- pyop3/buffer.py | 64 ++++++++++++++++++++++++++++++++--------------- pyop3_gpu_demo.py | 8 +++--- 2 files changed, 48 insertions(+), 24 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index d9be4a46b9..52f75c3f57 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -22,8 +22,8 @@ from pyop3.sf import DistributedObject, NullStarForest, StarForest, local_sf from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly from pyop3.device import ( - _current_device, - HOST_DEVICE + HOST_DEVICE, + _current_device ) from ._buffer_cy import set_petsc_mat_diagonal @@ -322,7 +322,7 @@ def dtype(self) -> np.dtype: return self._data.dtype def inc_state(self) -> None: - self._state[self.context] += 1 + self._state[self.get_context()] += 1 # NOTE: Why is this using _lazy_data instead of _data? # TODO: Either adjust for _lazy_data mapping or switch to _data @@ -335,15 +335,10 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer: else: data = np.zeros_like(self._lazy_data) return self.__record_init__(_name=name, _lazy_data=data) - - def sync_to_host(self): - self.state[pyop3.HOST_DEVICE] = cp.asnumpy(self.state["gpu"]) - self.state[pyop3.HOST_DEVICE] = self.state["gpu"] is_nested: ClassVar[bool] = False - @property - def context(self) -> str: + def get_context(self) -> str: return str(_current_device.get()) @property @@ -491,19 +486,40 @@ def assemble(self) -> None: def leaves_valid(self) -> bool: return self._leaves_valid - # TODO: Need logic to recognise when to copy data to GPU @property def _data(self): - context = self.context - - if context == "cpu": - if self._lazy_data[context] is None: - self._lazy_data[context] = np.zeros(self.shape, dtype=self.dtype) - elif context == "gpu": - if self._lazy_data[context] is None: - self._lazy_data[context] = cp.zeros(self.shape, dtype=self.dtype) - else: - raise RuntimeError("Offload device not supported") + context = self.get_context() + + # TODO: This logic cannot be kept + ''' + Flaw in that host and gpu modifications will be out of sync. + This could lead to situations where multiple copies are made before GPU + context has reached the same modifications as CPU. + Even worse if data is called without being modified - copied but no state change. + + Solution: + - Copy state modifications after context window + - If states match, check if None, else copy. + - Do we want to free cupy space after context window? + ''' + if context == "gpu": + # If GPU has modifications >= CPU and not None, don't copy. + if self._lazy_data[context] is None or self.state[context] < self.state[pyop3.HOST_DEVICE]: + self._sync_to_device() + + elif context == "cpu": + # If CPU has modifications >= GPU and not None, don't copy. + if self._lazy_data[context] is None or self.state[context] < self.state["gpu"]: + self._sync_to_host() + + # if context == "cpu": + # if self._lazy_data[context] is None: + # self._lazy_data[context] = np.zeros(self.shape, dtype=self.dtype) + # elif context == "gpu": + # if self._lazy_data[context] is None: + # self._lazy_data[context] = cp.zeros(self.shape, dtype=self.dtype) + # else: + # raise RuntimeError("Offload device not supported") if self.name == "array_247_buffer": breakpoint() @@ -620,7 +636,15 @@ def localize(self) -> ArrayBuffer: @cached_property def _localized(self) -> ArrayBuffer: return self.__record_init__(sf=None) + + # TODO: Consider whether to make these asynchronous and group with a sync + def _sync_to_host(self): + self._lazy_data[pyop3.HOST_DEVICE] = cp.asnumpy(self._lazy_data["gpu"]) + self.state[pyop3.HOST_DEVICE] = self.state["gpu"] + def _sync_to_device(self): + self._lazy_data["gpu"] = cp.asarray(self._lazy_data[pyop3.HOST_DEVICE]) + self.state["gpu"] = self.state[pyop3.HOST_DEVICE] class MatBufferSpec(abc.ABC): pass diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py index 6e735e7126..3a0df520de 100644 --- a/pyop3_gpu_demo.py +++ b/pyop3_gpu_demo.py @@ -55,9 +55,9 @@ # state tracking checks assert f.dat.buffer.state[host] == 1 # modified once - assert f.dat.buffer.state[gpu] == 1 # matches host + assert f.dat.buffer.state[str(gpu)] == 1 # matches host assert g.dat.buffer.state[host] == 0 # untouched - assert g.dat.buffer.state[gpu] == 1 # modified once + assert g.dat.buffer.state[str(gpu)] == 1 # modified once assert isinstance(f.dat.data_ro, np.ndarray) assert isinstance(g.dat.data_ro, np.ndarray) @@ -65,6 +65,6 @@ # state tracking checks assert f.dat.buffer.state[host] == 1 # modified once -assert f.dat.buffer.state[gpu] == 1 # matches host +assert f.dat.buffer.state[str(gpu)] == 1 # matches host assert g.dat.buffer.state[host] == 1 # matches device -assert g.dat.buffer.state[gpu] == 1 # modified once +assert g.dat.buffer.state[str(gpu)] == 1 # modified once From b044c7afe834f3036a059bc9d27ad1cfd20eb3b6 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Wed, 29 Apr 2026 19:18:23 +0100 Subject: [PATCH 08/36] tofix: revised approach ensuring explicit choice of GPU device --- pyop3/__init__.py | 1 - pyop3/buffer.py | 56 +++++++++++++++++++------------------------ pyop3/device.py | 60 +++++++++++++++++++++++++++++++++-------------- pyop3_gpu_demo.py | 3 +-- 4 files changed, 68 insertions(+), 52 deletions(-) diff --git a/pyop3/__init__.py b/pyop3/__init__.py index e5590ac2d1..7ce9203b0a 100644 --- a/pyop3/__init__.py +++ b/pyop3/__init__.py @@ -90,7 +90,6 @@ def _init_likwid(): from pyop3.device import ( # noqa: F401 HOST_DEVICE, GPU, - offloading ) from pyop3.sf import StarForest, single_star_sf, local_sf import pyop3.sf diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 52f75c3f57..974eef9a6f 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -23,7 +23,7 @@ from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly from pyop3.device import ( HOST_DEVICE, - _current_device + Device ) from ._buffer_cy import set_petsc_mat_diagonal @@ -322,7 +322,7 @@ def dtype(self) -> np.dtype: return self._data.dtype def inc_state(self) -> None: - self._state[self.get_context()] += 1 + self._state[self.get_context_string()] += 1 # NOTE: Why is this using _lazy_data instead of _data? # TODO: Either adjust for _lazy_data mapping or switch to _data @@ -338,8 +338,11 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer: is_nested: ClassVar[bool] = False - def get_context(self) -> str: - return str(_current_device.get()) + def get_context(self) -> Device: + return Device.current() + + def get_context_string(self) -> str: + return str(Device.current()) @property def handle(self) -> np.ndarray: @@ -489,41 +492,25 @@ def leaves_valid(self) -> bool: @property def _data(self): context = self.get_context() + context_str = self.get_context_string() - # TODO: This logic cannot be kept - ''' - Flaw in that host and gpu modifications will be out of sync. - This could lead to situations where multiple copies are made before GPU - context has reached the same modifications as CPU. - Even worse if data is called without being modified - copied but no state change. - - Solution: - - Copy state modifications after context window - - If states match, check if None, else copy. - - Do we want to free cupy space after context window? - ''' - if context == "gpu": - # If GPU has modifications >= CPU and not None, don't copy. - if self._lazy_data[context] is None or self.state[context] < self.state[pyop3.HOST_DEVICE]: + if context_str == "gpu": + if self._lazy_data[context_str] is None or self.state[context_str] < self.state[pyop3.HOST_DEVICE]: self._sync_to_device() - - elif context == "cpu": - # If CPU has modifications >= GPU and not None, don't copy. - if self._lazy_data[context] is None or self.state[context] < self.state["gpu"]: - self._sync_to_host() - - # if context == "cpu": - # if self._lazy_data[context] is None: - # self._lazy_data[context] = np.zeros(self.shape, dtype=self.dtype) - # elif context == "gpu": - # if self._lazy_data[context] is None: - # self._lazy_data[context] = cp.zeros(self.shape, dtype=self.dtype) + context.register(self) + + # if context_str == "cpu": + # if self._lazy_data[context_str] is None: + # self._lazy_data[context_str] = np.zeros(self.shape, dtype=self.dtype) + # elif context_str == "gpu": + # if self._lazy_data[context_str] is None: + # self._lazy_data[context_str] = cp.zeros(self.shape, dtype=self.dtype) # else: # raise RuntimeError("Offload device not supported") if self.name == "array_247_buffer": breakpoint() - return self._lazy_data[context] + return self._lazy_data[context_str] # TODO: I think the halo bits should only be handled at the Dat level via the # axis tree. Here we can just consider the array. @@ -638,6 +625,11 @@ def _localized(self) -> ArrayBuffer: return self.__record_init__(sf=None) # TODO: Consider whether to make these asynchronous and group with a sync + def maybe_sync_to_host(self): + # Only sync if modified whilst on GPU + self.state[pyop3.HOST_DEVICE] < self.state["gpu"]: + self._sync_to_host() + def _sync_to_host(self): self._lazy_data[pyop3.HOST_DEVICE] = cp.asnumpy(self._lazy_data["gpu"]) self.state[pyop3.HOST_DEVICE] = self.state["gpu"] diff --git a/pyop3/device.py b/pyop3/device.py index 2bbc0c0c83..b94ebf812d 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -4,20 +4,29 @@ import contextvars import warnings +import cupy as cp + + # TODO: The constant should be elsewhere but temporary here HOST_DEVICE = "cpu" -# NOTE: Use contextvars to act as a bridge between buffer and manager classes +# NOTE: Use contextvars to act as a bridge between buffer and manager class _current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE) class Device(metaclass=ABCMeta): - _available: bool _name: str - _accessed_buffer: list + _registered_arrays: set - def __init__(self): - self._available = True - self._accessed_buffer = [] + def __init__(self, device: int | None = None): + pass + + @staticmethod + def current(): + device = _current_device.get() + return device + + @abstractmethod + def register(self, arr): pass @abstractmethod @@ -28,12 +37,38 @@ def __repr__(self): def __str__(self): pass +# TODO: Necessary to make this? +class CPU(Device): + pass + +# Implementation follows similar idea to CuPY (with GPU(): ...) class GPU(Device): - def __init__(self): + def __init__(self, device: int | None = None): super().__init__() self._name = "gpu" - pass + self._registered_arrays = set() + self._token = None + self.device = cp.cuda.Device(device) + + def __enter__(self): + self._token = _current_device.set(self) + return self + + def __exit__(self, type, value, traceback): + self.sync_buffers() + _current_device.reset(self._token) + self._reset_register() + + def sync_buffers(self): + for arr in self._registered_arrays: + arr.maybe_sync_to_host() + + def register(self, arr): + self._registered_arrays.add(arr) + + def _reset_register(self): + self._registered_arrays = set() def __repr__(self): return self._name @@ -41,12 +76,3 @@ def __repr__(self): def __str__(self): return self._name -@contextlib.contextmanager -def offloading(dev: Device): - token = _current_device.set(dev) - try: - yield - finally: - _current_device.reset(token) - # TODO: Update buffers and copy back - # NOTE: Does this have to happen lazily if we do not have a record of accessed buffers? diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py index 3a0df520de..4b830656f7 100644 --- a/pyop3_gpu_demo.py +++ b/pyop3_gpu_demo.py @@ -41,9 +41,8 @@ assert g.dat.buffer.state[host] == 0 # untouched assert g.dat.buffer.state[str(gpu)] == 0 # untouched -with op3.offloading(gpu): +with gpu: # Getting the .data attribute on the GPU should give us back a GPU array type - assert not isinstance(f.dat.data_ro, np.ndarray) assert not isinstance(g.dat.data_ro, np.ndarray) From 1de6320b2fc98eba58d4eecc63ecdd02d9198ef2 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Thu, 30 Apr 2026 10:54:20 +0100 Subject: [PATCH 09/36] implicit transfer and defaultdict implementation (pub/sub eager copy model) --- pyop3/__init__.py | 1 + pyop3/buffer.py | 81 +++++++++++++++++++++++++++-------------------- pyop3/device.py | 75 +++++++++++++++++++++++++++---------------- pyop3_gpu_demo.py | 14 ++++---- 4 files changed, 102 insertions(+), 69 deletions(-) diff --git a/pyop3/__init__.py b/pyop3/__init__.py index 7ce9203b0a..e5590ac2d1 100644 --- a/pyop3/__init__.py +++ b/pyop3/__init__.py @@ -90,6 +90,7 @@ def _init_likwid(): from pyop3.device import ( # noqa: F401 HOST_DEVICE, GPU, + offloading ) from pyop3.sf import StarForest, single_star_sf, local_sf import pyop3.sf diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 974eef9a6f..88f3d7c6ed 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -22,8 +22,8 @@ from pyop3.sf import DistributedObject, NullStarForest, StarForest, local_sf from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly from pyop3.device import ( - HOST_DEVICE, - Device + Device, + HOST_DEVICE ) from ._buffer_cy import set_petsc_mat_diagonal @@ -239,7 +239,7 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer): _rank_equal: bool _ordered: bool # TODO: Mutable pair of variables corresponding to state for host & device - _state: dict[str, int] + _state: dict[Device, int] _max_value: np.number | None = None @@ -256,7 +256,16 @@ def instruction_executor_cache_key(self, buffer_counter: Mapping[AbstractBuffer, def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False): - data_mapping = {pyop3.HOST_DEVICE: None, "gpu": None} + data_mapping = {} + if isinstance(data, np.ndarray): + data_mapping[pyop3.HOST_DEVICE] = data + elif isinstance(data, cp.ndarray): + ctx = self.get_context() + # TODO: Raise custom exception if no Device context + if not isinstance(ctx, Device): + raise NotImplementedError + + data_mapping[ctx] = data if sf is None: sf = NullStarForest(data.size) @@ -270,11 +279,6 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, # TODO: CuPy has no support for `writeable` flag if constant: data.flags.writeable = False - - if isinstance(data, np.ndarray): - data_mapping[pyop3.HOST_DEVICE] = data - elif isinstance(data, cp.ndarray): - data_mapping["gpu"] = data self._lazy_data = data_mapping self.sf = sf @@ -283,8 +287,10 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, self._rank_equal = rank_equal self._max_value = max_value self._ordered = ordered - # TODO: Quite ugly, maybe switch to ENUM in future. - self._state = {pyop3.HOST_DEVICE: 0, "gpu": 0} + + # NOTE: Defaultdict approach, worth discussing with Connor + # TODO: Not acknowledging if created whilst in device context + self._state = collections.defaultdict(int, [(pyop3.HOST_DEVICE, 0)]) self.__post_init__() def __post_init__(self) -> None: @@ -322,7 +328,7 @@ def dtype(self) -> np.dtype: return self._data.dtype def inc_state(self) -> None: - self._state[self.get_context_string()] += 1 + self._state[self.get_context()] += 1 # NOTE: Why is this using _lazy_data instead of _data? # TODO: Either adjust for _lazy_data mapping or switch to _data @@ -341,9 +347,6 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer: def get_context(self) -> Device: return Device.current() - def get_context_string(self) -> str: - return str(Device.current()) - @property def handle(self) -> np.ndarray: return self._data @@ -491,26 +494,24 @@ def leaves_valid(self) -> bool: @property def _data(self): - context = self.get_context() - context_str = self.get_context_string() + ctx = self.get_context() - if context_str == "gpu": - if self._lazy_data[context_str] is None or self.state[context_str] < self.state[pyop3.HOST_DEVICE]: - self._sync_to_device() - context.register(self) + # NOTE: Consequence of implicit transfers + if ctx not in self._lazy_data: + self._lazy_data[ctx] = None - # if context_str == "cpu": - # if self._lazy_data[context_str] is None: - # self._lazy_data[context_str] = np.zeros(self.shape, dtype=self.dtype) - # elif context_str == "gpu": - # if self._lazy_data[context_str] is None: - # self._lazy_data[context_str] = cp.zeros(self.shape, dtype=self.dtype) - # else: - # raise RuntimeError("Offload device not supported") + # TODO: Don't like explicit GPU reference + if ctx.name == "gpu": + if self._lazy_data[ctx] is None or self.state[ctx] < self.state[pyop3.HOST_DEVICE]: + self._sync_to_device() + ctx.register(self) + + # if self._lazy_data[ctx_str] is None: + # self._lazy_data[ctx_str] = np.zeros(self.shape, dtype=self.dtype) if self.name == "array_247_buffer": breakpoint() - return self._lazy_data[context_str] + return self._lazy_data[ctx] # TODO: I think the halo bits should only be handled at the Dat level via the # axis tree. Here we can just consider the array. @@ -627,16 +628,26 @@ def _localized(self) -> ArrayBuffer: # TODO: Consider whether to make these asynchronous and group with a sync def maybe_sync_to_host(self): # Only sync if modified whilst on GPU - self.state[pyop3.HOST_DEVICE] < self.state["gpu"]: + if self.state[pyop3.HOST_DEVICE] < self.state[self.get_context()]: self._sync_to_host() def _sync_to_host(self): - self._lazy_data[pyop3.HOST_DEVICE] = cp.asnumpy(self._lazy_data["gpu"]) - self.state[pyop3.HOST_DEVICE] = self.state["gpu"] + context = self.get_context() + # TODO: Raise exception (warning?) as user on host already + if context is pyop3.HOST_DEVICE: + raise NotImplementedError + + self._lazy_data[pyop3.HOST_DEVICE] = cp.asnumpy(self._lazy_data[context]) + self.state[pyop3.HOST_DEVICE] = self.state[context] def _sync_to_device(self): - self._lazy_data["gpu"] = cp.asarray(self._lazy_data[pyop3.HOST_DEVICE]) - self.state["gpu"] = self.state[pyop3.HOST_DEVICE] + context = self.get_context() + # TODO: Raise exception (warning?) as user on host already + if context is pyop3.HOST_DEVICE: + raise NotImplementedError + + self._lazy_data[context] = cp.asarray(self._lazy_data[pyop3.HOST_DEVICE]) + self.state[context] = self.state[pyop3.HOST_DEVICE] class MatBufferSpec(abc.ABC): pass diff --git a/pyop3/device.py b/pyop3/device.py index b94ebf812d..f717cea7ad 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -6,13 +6,6 @@ import cupy as cp - -# TODO: The constant should be elsewhere but temporary here -HOST_DEVICE = "cpu" - -# NOTE: Use contextvars to act as a bridge between buffer and manager class -_current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE) - class Device(metaclass=ABCMeta): _name: str _registered_arrays: set @@ -24,24 +17,48 @@ def __init__(self, device: int | None = None): def current(): device = _current_device.get() return device + + @abstractmethod + def sync_buffers(self): + pass @abstractmethod def register(self, arr): pass @abstractmethod - def __repr__(self): - pass + def _reset_register(self): + pass - @abstractmethod + @property + def name(self): + return self._name + + def __repr__(self): + return self._name + def __str__(self): - pass + return self._name -# TODO: Necessary to make this? class CPU(Device): - pass -# Implementation follows similar idea to CuPY (with GPU(): ...) + def __init__(self, device: int | None = None): + super().__init__() + self._name = "cpu" + self._registered_arrays = set() + self.device = None + + # NOTE: Is it necessary to have any implementation here? + # Maybe perfunctory implementations but no real purpose + def sync_buffers(self): + pass + + def register(self, arr): + self._registered_arrays.add(arr) + + def _reset_register(self): + self._registered_arrays = set() + class GPU(Device): def __init__(self, device: int | None = None): @@ -51,15 +68,6 @@ def __init__(self, device: int | None = None): self._token = None self.device = cp.cuda.Device(device) - def __enter__(self): - self._token = _current_device.set(self) - return self - - def __exit__(self, type, value, traceback): - self.sync_buffers() - _current_device.reset(self._token) - self._reset_register() - def sync_buffers(self): for arr in self._registered_arrays: arr.maybe_sync_to_host() @@ -70,9 +78,22 @@ def register(self, arr): def _reset_register(self): self._registered_arrays = set() - def __repr__(self): - return self._name +@contextlib.contextmanager +def offloading(device: Device): + # TODO: Not device exception + if not isinstance(device, Device): + raise NotImplementedError - def __str__(self): - return self._name + token = _current_device.set(device) + try: + yield + finally: + device.sync_buffers() + device._reset_register() + _current_device.reset(token) + +# TODO: Should this const variable be here? +HOST_DEVICE = CPU() +# NOTE: Use contextvars to act as a bridge between buffer and manager class +_current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE) diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py index 4b830656f7..6802e5fea6 100644 --- a/pyop3_gpu_demo.py +++ b/pyop3_gpu_demo.py @@ -37,11 +37,11 @@ # state tracking checks, .buffer.state is now device-specific assert f.dat.buffer.state[host] == 1 # modified once -assert f.dat.buffer.state[str(gpu)] == 0 # untouched +assert f.dat.buffer.state[gpu] == 0 # untouched assert g.dat.buffer.state[host] == 0 # untouched -assert g.dat.buffer.state[str(gpu)] == 0 # untouched +assert g.dat.buffer.state[gpu] == 0 # untouched -with gpu: +with op3.offloading(gpu): # Getting the .data attribute on the GPU should give us back a GPU array type assert not isinstance(f.dat.data_ro, np.ndarray) assert not isinstance(g.dat.data_ro, np.ndarray) @@ -54,9 +54,9 @@ # state tracking checks assert f.dat.buffer.state[host] == 1 # modified once - assert f.dat.buffer.state[str(gpu)] == 1 # matches host + assert f.dat.buffer.state[gpu] == 1 # matches host assert g.dat.buffer.state[host] == 0 # untouched - assert g.dat.buffer.state[str(gpu)] == 1 # modified once + assert g.dat.buffer.state[gpu] == 1 # modified once assert isinstance(f.dat.data_ro, np.ndarray) assert isinstance(g.dat.data_ro, np.ndarray) @@ -64,6 +64,6 @@ # state tracking checks assert f.dat.buffer.state[host] == 1 # modified once -assert f.dat.buffer.state[str(gpu)] == 1 # matches host +assert f.dat.buffer.state[gpu] == 1 # matches host assert g.dat.buffer.state[host] == 1 # matches device -assert g.dat.buffer.state[str(gpu)] == 1 # modified once +assert g.dat.buffer.state[gpu] == 1 # modified once From 6a263349f137bf6793b773e9846020f80301b341 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Thu, 30 Apr 2026 12:02:58 +0100 Subject: [PATCH 10/36] explicit check if GPU available on init --- pyop3/buffer.py | 5 +++-- pyop3/device.py | 36 +++++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 88f3d7c6ed..ac984790de 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -626,8 +626,9 @@ def _localized(self) -> ArrayBuffer: return self.__record_init__(sf=None) # TODO: Consider whether to make these asynchronous and group with a sync + # NOTE: Perhaps it is easier to allow user to make it non/blocking def maybe_sync_to_host(self): - # Only sync if modified whilst on GPU + # NOTE: Ugly but won't throw error if context is host if self.state[pyop3.HOST_DEVICE] < self.state[self.get_context()]: self._sync_to_host() @@ -642,7 +643,7 @@ def _sync_to_host(self): def _sync_to_device(self): context = self.get_context() - # TODO: Raise exception (warning?) as user on host already + # TODO: Raise exception as user still on host if context is pyop3.HOST_DEVICE: raise NotImplementedError diff --git a/pyop3/device.py b/pyop3/device.py index f717cea7ad..6823d96c0f 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -4,13 +4,12 @@ import contextvars import warnings -import cupy as cp - class Device(metaclass=ABCMeta): _name: str _registered_arrays: set + _device_index: int | None - def __init__(self, device: int | None = None): + def __init__(self, device_index: int | None = None): pass @staticmethod @@ -34,6 +33,10 @@ def _reset_register(self): def name(self): return self._name + @property + def device_index(self): + return self._device_index + def __repr__(self): return self._name @@ -42,11 +45,11 @@ def __str__(self): class CPU(Device): - def __init__(self, device: int | None = None): + def __init__(self, device_index: int | None = None): super().__init__() self._name = "cpu" self._registered_arrays = set() - self.device = None + self._device_index = device_index # NOTE: Is it necessary to have any implementation here? # Maybe perfunctory implementations but no real purpose @@ -61,12 +64,19 @@ def _reset_register(self): class GPU(Device): - def __init__(self, device: int | None = None): + def __init__(self, device_index: int | None = None): super().__init__() self._name = "gpu" self._registered_arrays = set() self._token = None - self.device = cp.cuda.Device(device) + self._device_index = device_index + + try: + import cupy as cp + assert cp.is_available() + except: + # Raise No GPU exception + raise NotImplementedError def sync_buffers(self): for arr in self._registered_arrays: @@ -79,17 +89,17 @@ def _reset_register(self): self._registered_arrays = set() @contextlib.contextmanager -def offloading(device: Device): - # TODO: Not device exception - if not isinstance(device, Device): +def offloading(dev: Device): + # TODO: Not Device exception + if not isinstance(dev, Device): raise NotImplementedError - token = _current_device.set(device) + token = _current_device.set(dev) try: yield finally: - device.sync_buffers() - device._reset_register() + dev.sync_buffers() + dev._reset_register() _current_device.reset(token) # TODO: Should this const variable be here? From 8d1f967c49b69ed44a599b981689e87eb388be82 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Thu, 30 Apr 2026 16:38:35 +0100 Subject: [PATCH 11/36] cudagpu and fix incoming re: remove eager copying/register & dev syncing --- pyop3/buffer.py | 6 +++--- pyop3/device.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index ac984790de..c83cc59459 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -232,14 +232,14 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer): # {{{ Instance attrs # TODO: Update to lazily allocated pair of host/device arrays - _lazy_data: dict[str, np.ndarray | cp.ndarray] = dataclasses.field(repr=False) + _lazy_data: dict[Device, np.ndarray | cp.ndarray] = dataclasses.field(repr=False) sf: StarForest _name: str _constant: bool _rank_equal: bool _ordered: bool - # TODO: Mutable pair of variables corresponding to state for host & device - _state: dict[Device, int] + # TODO: Should this be a defaultdict? + _state: collections.defaultdict[Device, int] _max_value: np.number | None = None diff --git a/pyop3/device.py b/pyop3/device.py index 6823d96c0f..f12375ba21 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -75,7 +75,7 @@ def __init__(self, device_index: int | None = None): import cupy as cp assert cp.is_available() except: - # Raise No GPU exception + # TODO: Raise No GPU exception raise NotImplementedError def sync_buffers(self): From 9a729e916686f72fcbc5a89fd85656a070feb062 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Fri, 1 May 2026 11:03:13 +0100 Subject: [PATCH 12/36] move conversion logic to device.py --- pyop3/__init__.py | 2 +- pyop3/buffer.py | 72 +++++++++++++++++++++++------------------------ pyop3/device.py | 58 ++++++++++++++------------------------ pyop3_gpu_demo.py | 4 ++- 4 files changed, 61 insertions(+), 75 deletions(-) diff --git a/pyop3/__init__.py b/pyop3/__init__.py index e5590ac2d1..e53a7360e7 100644 --- a/pyop3/__init__.py +++ b/pyop3/__init__.py @@ -89,7 +89,7 @@ def _init_likwid(): ) from pyop3.device import ( # noqa: F401 HOST_DEVICE, - GPU, + CUDAGPU, offloading ) from pyop3.sf import StarForest, single_star_sf, local_sf diff --git a/pyop3/buffer.py b/pyop3/buffer.py index c83cc59459..051fcfe66f 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -23,6 +23,8 @@ from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly from pyop3.device import ( Device, + CUDAGPU, + CPU, HOST_DEVICE ) @@ -240,6 +242,7 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer): _ordered: bool # TODO: Should this be a defaultdict? _state: collections.defaultdict[Device, int] + _last_updated_device: Device _max_value: np.number | None = None @@ -287,10 +290,11 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, self._rank_equal = rank_equal self._max_value = max_value self._ordered = ordered + self._last_updated_device = self.get_context() - # NOTE: Defaultdict approach, worth discussing with Connor + # NOTE: Connor and I don't like defaultdict, unsure how to change it atm # TODO: Not acknowledging if created whilst in device context - self._state = collections.defaultdict(int, [(pyop3.HOST_DEVICE, 0)]) + self._state = collections.defaultdict(int, [(self.get_context(), 0)]) self.__post_init__() def __post_init__(self) -> None: @@ -328,7 +332,9 @@ def dtype(self) -> np.dtype: return self._data.dtype def inc_state(self) -> None: - self._state[self.get_context()] += 1 + ctx = self.get_context() + self._state[ctx] += 1 + self._last_updated_device = ctx # NOTE: Why is this using _lazy_data instead of _data? # TODO: Either adjust for _lazy_data mapping or switch to _data @@ -496,18 +502,12 @@ def leaves_valid(self) -> bool: def _data(self): ctx = self.get_context() - # NOTE: Consequence of implicit transfers - if ctx not in self._lazy_data: - self._lazy_data[ctx] = None - - # TODO: Don't like explicit GPU reference - if ctx.name == "gpu": - if self._lazy_data[ctx] is None or self.state[ctx] < self.state[pyop3.HOST_DEVICE]: - self._sync_to_device() - ctx.register(self) + if not self._is_data_available(ctx) or not self._is_data_synced(ctx): + self.sync_devices(ctx) - # if self._lazy_data[ctx_str] is None: - # self._lazy_data[ctx_str] = np.zeros(self.shape, dtype=self.dtype) + # NOTE: If data is None, set to zeros? + # if self._lazy_data is None: + # self._lazy_data = np.zeros(self.shape, dtype=self.dtype) if self.name == "array_247_buffer": breakpoint() @@ -627,28 +627,28 @@ def _localized(self) -> ArrayBuffer: # TODO: Consider whether to make these asynchronous and group with a sync # NOTE: Perhaps it is easier to allow user to make it non/blocking - def maybe_sync_to_host(self): - # NOTE: Ugly but won't throw error if context is host - if self.state[pyop3.HOST_DEVICE] < self.state[self.get_context()]: - self._sync_to_host() - - def _sync_to_host(self): - context = self.get_context() - # TODO: Raise exception (warning?) as user on host already - if context is pyop3.HOST_DEVICE: - raise NotImplementedError - - self._lazy_data[pyop3.HOST_DEVICE] = cp.asnumpy(self._lazy_data[context]) - self.state[pyop3.HOST_DEVICE] = self.state[context] - - def _sync_to_device(self): - context = self.get_context() - # TODO: Raise exception as user still on host - if context is pyop3.HOST_DEVICE: - raise NotImplementedError - - self._lazy_data[context] = cp.asarray(self._lazy_data[pyop3.HOST_DEVICE]) - self.state[context] = self.state[pyop3.HOST_DEVICE] + def sync_devices(self, current_device: Device): + last_updated_device = self._last_updated_device + + self._lazy_data[current_device] = current_device.asarray(self._lazy_data[last_updated_device]) + + # if isinstance(current_device, CUDAGPU): + # self._lazy_data[current_device] = cp.asarray(self._lazy_data[last_updated_device]) + # elif isinstance(current_device, CPU): + # if isinstance(last_updated_device, CUDAGPU): + # self._lazy_data[current_device] = cp.asnumpy(self._lazy_data[last_updated_device]) + # elif isinstance(last_updated_device, CPU): + # self._lazy_data[current_device] = np.array(self._lazy_data[last_updated_device]) + # else: + # raise NotImplementedError + + self._state[current_device] = self._state[last_updated_device] + + def _is_data_available(self, device: Device): + return device in self._lazy_data + + def _is_data_synced(self, device: Device): + return self.state[device] == max(self.state.values()) class MatBufferSpec(abc.ABC): pass diff --git a/pyop3/device.py b/pyop3/device.py index f12375ba21..eadc74aaaf 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -4,30 +4,21 @@ import contextvars import warnings +import numpy as np + class Device(metaclass=ABCMeta): _name: str - _registered_arrays: set _device_index: int | None def __init__(self, device_index: int | None = None): pass + # TODO: Method should not be associated with object + # NOTE: Would be weird to get current device as CPU whilst requesting on GPU @staticmethod def current(): device = _current_device.get() return device - - @abstractmethod - def sync_buffers(self): - pass - - @abstractmethod - def register(self, arr): - pass - - @abstractmethod - def _reset_register(self): - pass @property def name(self): @@ -37,6 +28,10 @@ def name(self): def device_index(self): return self._device_index + @abstractmethod + def asarray(self, arr): + pass + def __repr__(self): return self._name @@ -51,42 +46,33 @@ def __init__(self, device_index: int | None = None): self._registered_arrays = set() self._device_index = device_index - # NOTE: Is it necessary to have any implementation here? - # Maybe perfunctory implementations but no real purpose - def sync_buffers(self): - pass - - def register(self, arr): - self._registered_arrays.add(arr) - - def _reset_register(self): - self._registered_arrays = set() + def asarray(self, arr): + # NOTE: Better logic if we switch from just NumPy/CuPy + if not isinstance(arr, np.ndarray): + import cupy as cp + return cp.asnumpy(arr) + + return np.array(arr) -class GPU(Device): +class CUDAGPU(Device): def __init__(self, device_index: int | None = None): super().__init__() - self._name = "gpu" + self._name = "CudaGPU" self._registered_arrays = set() self._token = None self._device_index = device_index try: import cupy as cp - assert cp.is_available() + assert cp.is_available() except: # TODO: Raise No GPU exception raise NotImplementedError - def sync_buffers(self): - for arr in self._registered_arrays: - arr.maybe_sync_to_host() - - def register(self, arr): - self._registered_arrays.add(arr) - - def _reset_register(self): - self._registered_arrays = set() + def asarray(self, arr): + import cupy as cp + return cp.asarray(arr) @contextlib.contextmanager def offloading(dev: Device): @@ -98,8 +84,6 @@ def offloading(dev: Device): try: yield finally: - dev.sync_buffers() - dev._reset_register() _current_device.reset(token) # TODO: Should this const variable be here? diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py index 6802e5fea6..b124ba1c21 100644 --- a/pyop3_gpu_demo.py +++ b/pyop3_gpu_demo.py @@ -24,7 +24,7 @@ # made up API, we need some way to identify the device host = op3.HOST_DEVICE # or similar -gpu = op3.GPU() +gpu = op3.CUDAGPU() mesh = UnitSquareMesh(3, 3) V = FunctionSpace(mesh, "P", 2) @@ -58,6 +58,8 @@ assert g.dat.buffer.state[host] == 0 # untouched assert g.dat.buffer.state[gpu] == 1 # modified once +# print(f"{g.dat.buffer._lazy_data=}") +# print(f"{f.dat.buffer._lazy_data=}") assert isinstance(f.dat.data_ro, np.ndarray) assert isinstance(g.dat.data_ro, np.ndarray) assert (g.dat.data_ro == 23).all() From 21bac65b4f1b31efeb27552705ff5a0190c7bed8 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Fri, 1 May 2026 12:59:29 +0100 Subject: [PATCH 13/36] managing buffer duplicate --- pyop3/buffer.py | 61 +++++++++++++++++------------------------------ pyop3/device.py | 24 +++++++++++-------- pyop3/utils.py | 16 ++++++++++--- pyop3_gpu_demo.py | 3 +++ 4 files changed, 52 insertions(+), 52 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 051fcfe66f..6190689c2e 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -11,7 +11,6 @@ from typing import Any, ClassVar, Hashable import numpy as np -import cupy as cp from mpi4py import MPI from petsc4py import PETSc @@ -25,7 +24,8 @@ Device, CUDAGPU, CPU, - HOST_DEVICE + HOST_DEVICE, + _current_device ) from ._buffer_cy import set_petsc_mat_diagonal @@ -259,17 +259,6 @@ def instruction_executor_cache_key(self, buffer_counter: Mapping[AbstractBuffer, def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False): - data_mapping = {} - if isinstance(data, np.ndarray): - data_mapping[pyop3.HOST_DEVICE] = data - elif isinstance(data, cp.ndarray): - ctx = self.get_context() - # TODO: Raise custom exception if no Device context - if not isinstance(ctx, Device): - raise NotImplementedError - - data_mapping[ctx] = data - if sf is None: sf = NullStarForest(data.size) name = utils.maybe_generate_name(name, prefix, self.DEFAULT_PREFIX) @@ -279,10 +268,10 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, if rank_equal and not constant: raise ValueError - # TODO: CuPy has no support for `writeable` flag - if constant: - data.flags.writeable = False - + ctx = self.get_context() + data_mapping = {} + data_mapping[ctx] = ctx.asarray(data) + self._lazy_data = data_mapping self.sf = sf self._name = name @@ -293,8 +282,12 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, self._last_updated_device = self.get_context() # NOTE: Connor and I don't like defaultdict, unsure how to change it atm - # TODO: Not acknowledging if created whilst in device context - self._state = collections.defaultdict(int, [(self.get_context(), 0)]) + self._state = collections.defaultdict(int, [(ctx, 0)]) + + # TODO: CuPy has no support for `writeable` flag + if constant and isinstance(self._data, np.ndarray): + self._data.flags.writeable = False + self.__post_init__() def __post_init__(self) -> None: @@ -303,7 +296,7 @@ def __post_init__(self) -> None: assert self.constant if self.ordered: utils.debug_assert(lambda: utils.is_sorted(self._lazy_data)) - if self.constant: + if self.constant and isinstance(self._data, np.ndarray): assert not self._data.flags.writeable # }}} @@ -336,22 +329,21 @@ def inc_state(self) -> None: self._state[ctx] += 1 self._last_updated_device = ctx - # NOTE: Why is this using _lazy_data instead of _data? # TODO: Either adjust for _lazy_data mapping or switch to _data def duplicate(self, *, copy: bool = False) -> ArrayBuffer: # make sure that there are no pending transfers before we copy self.assemble() name = f"{self.name}_copy" if copy: - data = self._lazy_data.copy() + data = {obj: arr.copy() for obj, arr in self._lazy_data.items()} else: - data = np.zeros_like(self._lazy_data) + data = {obj: obj.zeros_like(arr) for obj, arr in self._lazy_data.items()} return self.__record_init__(_name=name, _lazy_data=data) is_nested: ClassVar[bool] = False def get_context(self) -> Device: - return Device.current() + return _current_device.get() @property def handle(self) -> np.ndarray: @@ -625,29 +617,20 @@ def localize(self) -> ArrayBuffer: def _localized(self) -> ArrayBuffer: return self.__record_init__(sf=None) - # TODO: Consider whether to make these asynchronous and group with a sync - # NOTE: Perhaps it is easier to allow user to make it non/blocking def sync_devices(self, current_device: Device): last_updated_device = self._last_updated_device self._lazy_data[current_device] = current_device.asarray(self._lazy_data[last_updated_device]) - - # if isinstance(current_device, CUDAGPU): - # self._lazy_data[current_device] = cp.asarray(self._lazy_data[last_updated_device]) - # elif isinstance(current_device, CPU): - # if isinstance(last_updated_device, CUDAGPU): - # self._lazy_data[current_device] = cp.asnumpy(self._lazy_data[last_updated_device]) - # elif isinstance(last_updated_device, CPU): - # self._lazy_data[current_device] = np.array(self._lazy_data[last_updated_device]) - # else: - # raise NotImplementedError - self._state[current_device] = self._state[last_updated_device] - def _is_data_available(self, device: Device): + # NOTE: Current fix for CuPy having no `writeable support` or maintaining flags + if self.constant and isinstance(self._lazy_data[current_device], np.ndarray): + self._lazy_data[current_device].flags.writeable = False + + def _is_data_available(self, device: Device) -> bool: return device in self._lazy_data - def _is_data_synced(self, device: Device): + def _is_data_synced(self, device: Device) -> bool: return self.state[device] == max(self.state.values()) class MatBufferSpec(abc.ABC): diff --git a/pyop3/device.py b/pyop3/device.py index eadc74aaaf..0ad464f350 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -13,13 +13,6 @@ class Device(metaclass=ABCMeta): def __init__(self, device_index: int | None = None): pass - # TODO: Method should not be associated with object - # NOTE: Would be weird to get current device as CPU whilst requesting on GPU - @staticmethod - def current(): - device = _current_device.get() - return device - @property def name(self): return self._name @@ -32,6 +25,10 @@ def device_index(self): def asarray(self, arr): pass + @abstractmethod + def zeros_like(self, arr): + pass + def __repr__(self): return self._name @@ -47,12 +44,15 @@ def __init__(self, device_index: int | None = None): self._device_index = device_index def asarray(self, arr): - # NOTE: Better logic if we switch from just NumPy/CuPy + # NOTE: Better logic needed if we switch from just NumPy/CuPy if not isinstance(arr, np.ndarray): import cupy as cp return cp.asnumpy(arr) - return np.array(arr) + return np.array(arr) + + def zeros_like(self, arr): + return np.zeros_like(arr) class CUDAGPU(Device): @@ -73,6 +73,10 @@ def __init__(self, device_index: int | None = None): def asarray(self, arr): import cupy as cp return cp.asarray(arr) + + def zeros_like(self, arr): + import cupy as cp + return cp.zeros_like(arr) @contextlib.contextmanager def offloading(dev: Device): @@ -89,5 +93,5 @@ def offloading(dev: Device): # TODO: Should this const variable be here? HOST_DEVICE = CPU() -# NOTE: Use contextvars to act as a bridge between buffer and manager class +# NOTE: Use contextvars to act as a bridge between buffer and manager _current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE) diff --git a/pyop3/utils.py b/pyop3/utils.py index b9f34d4e10..7f991416af 100644 --- a/pyop3/utils.py +++ b/pyop3/utils.py @@ -14,7 +14,6 @@ import cachetools import numpy as np -import cupy as cp import pytools from immutabledict import immutabledict from mpi4py import MPI @@ -27,6 +26,13 @@ from pyop3.exceptions import CommMismatchException, CommNotFoundException, Pyop3Exception, UnhashableObjectException, UnsupportedArrayException from pyop3.mpi import collective +ndarray_types = [np.ndarray,] +try: + import cupy as cp + ndarray_types = [np.ndarray, cp.ndarray] +except ImportError: + pass + class UniqueNameGenerator(pytools.UniqueNameGenerator): """Class for generating unique names.""" @@ -170,7 +176,7 @@ def is_sequence(item): def flatten(iterable): """Recursively flatten a nested iterable.""" - if isinstance(iterable, np.ndarray): + if isinstance(iterable, tuple(ndarray_types)): return iterable.flatten() if not isinstance(iterable, (list, tuple)): return (iterable,) @@ -587,8 +593,12 @@ def pretty_type(obj: Any) -> str: def safe_equals(a, b, /) -> bool: - if any(isinstance(x, np.ndarray) for x in [a, b]): + if any(isinstance(x, tuple(ndarray_types)) for x in [a, b]): return (a == b).all() + if any(isinstance(x, dict) for x in [a, b]): + if a.keys() != b.keys(): + return False + return all(safe_equals(a[k], b[k]) for k in a) else: return bool(a == b) diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py index b124ba1c21..ec9f1c7b22 100644 --- a/pyop3_gpu_demo.py +++ b/pyop3_gpu_demo.py @@ -51,6 +51,9 @@ # Do the assignment using MLIR (this is a later step) # g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="compile") + k = Function(V) + k.dat.buffer.duplicate() + k.dat.buffer.duplicate(copy=True) # state tracking checks assert f.dat.buffer.state[host] == 1 # modified once From 67fe76a1db9ece12458e752bc173ce29b9ceb87a Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Fri, 1 May 2026 13:05:09 +0100 Subject: [PATCH 14/36] cleanup unnecessary todos/notes --- pyop3/buffer.py | 3 +-- pyop3/device.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 6190689c2e..94dbe80342 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -233,13 +233,13 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer): # {{{ Instance attrs - # TODO: Update to lazily allocated pair of host/device arrays _lazy_data: dict[Device, np.ndarray | cp.ndarray] = dataclasses.field(repr=False) sf: StarForest _name: str _constant: bool _rank_equal: bool _ordered: bool + # TODO: Should this be a defaultdict? _state: collections.defaultdict[Device, int] _last_updated_device: Device @@ -329,7 +329,6 @@ def inc_state(self) -> None: self._state[ctx] += 1 self._last_updated_device = ctx - # TODO: Either adjust for _lazy_data mapping or switch to _data def duplicate(self, *, copy: bool = False) -> ArrayBuffer: # make sure that there are no pending transfers before we copy self.assemble() diff --git a/pyop3/device.py b/pyop3/device.py index 0ad464f350..d5b56f9487 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -90,7 +90,7 @@ def offloading(dev: Device): finally: _current_device.reset(token) -# TODO: Should this const variable be here? +# NOTE: Should this const variable be here? HOST_DEVICE = CPU() # NOTE: Use contextvars to act as a bridge between buffer and manager From 156831573bde4678c925f267624222d3181ed830 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Mon, 4 May 2026 10:21:08 +0100 Subject: [PATCH 15/36] removing notes and cleaning --- pyop3/buffer.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 94dbe80342..86174c335f 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -240,7 +240,7 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer): _rank_equal: bool _ordered: bool - # TODO: Should this be a defaultdict? + # TODO: Connor and I both dislike defaultdict but I can't think of an alternative atm _state: collections.defaultdict[Device, int] _last_updated_device: Device @@ -269,8 +269,7 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, raise ValueError ctx = self.get_context() - data_mapping = {} - data_mapping[ctx] = ctx.asarray(data) + data_mapping = {ctx: ctx.asarray(data)} self._lazy_data = data_mapping self.sf = sf @@ -279,9 +278,8 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, self._rank_equal = rank_equal self._max_value = max_value self._ordered = ordered - self._last_updated_device = self.get_context() + self._last_updated_device = ctx - # NOTE: Connor and I don't like defaultdict, unsure how to change it atm self._state = collections.defaultdict(int, [(ctx, 0)]) # TODO: CuPy has no support for `writeable` flag From 25179949f1ccf5b30bd9788ce823e7ece9b2d2e0 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Mon, 4 May 2026 11:02:59 +0100 Subject: [PATCH 16/36] fix: added copy to avoid weak reference - new state object (int -> dict) leads the reassignment being a weak reference. --- firedrake/mesh.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/firedrake/mesh.py b/firedrake/mesh.py index eb4be68b18..cea52bbcd6 100644 --- a/firedrake/mesh.py +++ b/firedrake/mesh.py @@ -3772,7 +3772,7 @@ def __init__(self, coordinates): self._bounding_box_coords = None self._spatial_index = None - self._saved_coordinate_dat_version = coordinates.dat.buffer.state + self._saved_coordinate_dat_version = coordinates.dat.buffer.state.copy() self._cache = {} @@ -4009,7 +4009,7 @@ def spatial_index(self): # Build spatial index with PETSc.Log.Event("spatial_index_build"): self._spatial_index = spatialindex.from_regions(coords_min, coords_max) - self._saved_coordinate_dat_version = self.coordinates.dat.buffer.state + self._saved_coordinate_dat_version = self.coordinates.dat.buffer.state.copy() return self._spatial_index @PETSc.Log.EventDecorator() From 9e7c6adbd9a0fe19caae62500d1f2b1195c0a76e Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Mon, 4 May 2026 11:10:46 +0100 Subject: [PATCH 17/36] test: data_wo access works in context --- pyop3_gpu_demo.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py index ec9f1c7b22..86dcbaad8d 100644 --- a/pyop3_gpu_demo.py +++ b/pyop3_gpu_demo.py @@ -55,20 +55,27 @@ k.dat.buffer.duplicate() k.dat.buffer.duplicate(copy=True) + k.dat.data_wo[...] = 1 + # state tracking checks assert f.dat.buffer.state[host] == 1 # modified once assert f.dat.buffer.state[gpu] == 1 # matches host assert g.dat.buffer.state[host] == 0 # untouched assert g.dat.buffer.state[gpu] == 1 # modified once + assert k.dat.buffer.state[host] == 0 # modified once + assert k.dat.buffer.state[gpu] == 1 # modified once # print(f"{g.dat.buffer._lazy_data=}") # print(f"{f.dat.buffer._lazy_data=}") assert isinstance(f.dat.data_ro, np.ndarray) assert isinstance(g.dat.data_ro, np.ndarray) assert (g.dat.data_ro == 23).all() +assert (k.dat.data_ro == 1).all() # state tracking checks assert f.dat.buffer.state[host] == 1 # modified once assert f.dat.buffer.state[gpu] == 1 # matches host assert g.dat.buffer.state[host] == 1 # matches device assert g.dat.buffer.state[gpu] == 1 # modified once +assert k.dat.buffer.state[host] == 1 # modified once +assert k.dat.buffer.state[gpu] == 1 # modified once From cb5f28ed242f55df5f690641f4841b2542ca5657 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Mon, 4 May 2026 11:23:05 +0100 Subject: [PATCH 18/36] add flatten from prev logic --- pyop3/buffer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 86174c335f..b9e4d81b5b 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -268,6 +268,7 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, if rank_equal and not constant: raise ValueError + data = data.flatten() ctx = self.get_context() data_mapping = {ctx: ctx.asarray(data)} From 36d2b07728c5db590a3c1e6ff7f84ad3710de2b6 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Mon, 4 May 2026 14:04:12 +0100 Subject: [PATCH 19/36] context function as global function and def state - defaultdict gives -1 as default value if device object does not exist --- pyop3/buffer.py | 27 ++++++++++++--------------- pyop3/device.py | 3 +++ pyop3_gpu_demo.py | 6 +++--- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index b9e4d81b5b..4578d3d065 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -25,7 +25,7 @@ CUDAGPU, CPU, HOST_DEVICE, - _current_device + get_current_device ) from ._buffer_cy import set_petsc_mat_diagonal @@ -269,8 +269,8 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, raise ValueError data = data.flatten() - ctx = self.get_context() - data_mapping = {ctx: ctx.asarray(data)} + curr_dev = get_current_device() + data_mapping = {curr_dev: curr_dev.asarray(data)} self._lazy_data = data_mapping self.sf = sf @@ -279,9 +279,9 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, self._rank_equal = rank_equal self._max_value = max_value self._ordered = ordered - self._last_updated_device = ctx + self._last_updated_device = curr_dev - self._state = collections.defaultdict(int, [(ctx, 0)]) + self._state = collections.defaultdict(lambda: -1, [(curr_dev, 0)]) # TODO: CuPy has no support for `writeable` flag if constant and isinstance(self._data, np.ndarray): @@ -324,9 +324,9 @@ def dtype(self) -> np.dtype: return self._data.dtype def inc_state(self) -> None: - ctx = self.get_context() - self._state[ctx] += 1 - self._last_updated_device = ctx + curr_dev = get_current_device() + self._state[curr_dev] += 1 + self._last_updated_device = curr_dev def duplicate(self, *, copy: bool = False) -> ArrayBuffer: # make sure that there are no pending transfers before we copy @@ -340,9 +340,6 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer: is_nested: ClassVar[bool] = False - def get_context(self) -> Device: - return _current_device.get() - @property def handle(self) -> np.ndarray: return self._data @@ -490,10 +487,10 @@ def leaves_valid(self) -> bool: @property def _data(self): - ctx = self.get_context() + curr_dev = get_current_device() - if not self._is_data_available(ctx) or not self._is_data_synced(ctx): - self.sync_devices(ctx) + if not self._is_data_available(curr_dev) or not self._is_data_synced(curr_dev): + self.sync_devices(curr_dev) # NOTE: If data is None, set to zeros? # if self._lazy_data is None: @@ -501,7 +498,7 @@ def _data(self): if self.name == "array_247_buffer": breakpoint() - return self._lazy_data[ctx] + return self._lazy_data[curr_dev] # TODO: I think the halo bits should only be handled at the Dat level via the # axis tree. Here we can just consider the array. diff --git a/pyop3/device.py b/pyop3/device.py index d5b56f9487..2fd0810d7f 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -95,3 +95,6 @@ def offloading(dev: Device): # NOTE: Use contextvars to act as a bridge between buffer and manager _current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE) + +def get_current_device(): + return _current_device.get() diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py index 86dcbaad8d..027c9a6957 100644 --- a/pyop3_gpu_demo.py +++ b/pyop3_gpu_demo.py @@ -37,9 +37,9 @@ # state tracking checks, .buffer.state is now device-specific assert f.dat.buffer.state[host] == 1 # modified once -assert f.dat.buffer.state[gpu] == 0 # untouched +assert f.dat.buffer.state[gpu] == -1 # not created assert g.dat.buffer.state[host] == 0 # untouched -assert g.dat.buffer.state[gpu] == 0 # untouched +assert g.dat.buffer.state[gpu] == -1 # not created with op3.offloading(gpu): # Getting the .data attribute on the GPU should give us back a GPU array type @@ -62,7 +62,7 @@ assert f.dat.buffer.state[gpu] == 1 # matches host assert g.dat.buffer.state[host] == 0 # untouched assert g.dat.buffer.state[gpu] == 1 # modified once - assert k.dat.buffer.state[host] == 0 # modified once + assert k.dat.buffer.state[host] == -1 # not created assert k.dat.buffer.state[gpu] == 1 # modified once # print(f"{g.dat.buffer._lazy_data=}") From e5a010746f832019a261f9b8e679366ed2df005a Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Mon, 4 May 2026 14:14:59 +0100 Subject: [PATCH 20/36] fix: maintaining constant array property - constant property was lost between cupy/numpy conversions - fixed by passing kwarg that is disregarded for cupy but used for numpy --- pyop3/buffer.py | 17 +++++------------ pyop3/device.py | 17 +++++++++++------ 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 4578d3d065..053f739fa1 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -268,11 +268,8 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, if rank_equal and not constant: raise ValueError - data = data.flatten() curr_dev = get_current_device() - data_mapping = {curr_dev: curr_dev.asarray(data)} - self._lazy_data = data_mapping self.sf = sf self._name = name self._constant = constant @@ -281,11 +278,11 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, self._ordered = ordered self._last_updated_device = curr_dev - self._state = collections.defaultdict(lambda: -1, [(curr_dev, 0)]) + data = data.flatten() + data_mapping = {curr_dev: curr_dev.asarray(data, constant=self._constant)} + self._lazy_data = data_mapping - # TODO: CuPy has no support for `writeable` flag - if constant and isinstance(self._data, np.ndarray): - self._data.flags.writeable = False + self._state = collections.defaultdict(lambda: -1, [(curr_dev, 0)]) self.__post_init__() @@ -615,13 +612,9 @@ def _localized(self) -> ArrayBuffer: def sync_devices(self, current_device: Device): last_updated_device = self._last_updated_device - self._lazy_data[current_device] = current_device.asarray(self._lazy_data[last_updated_device]) + self._lazy_data[current_device] = current_device.asarray(self._lazy_data[last_updated_device], constant=self.constant) self._state[current_device] = self._state[last_updated_device] - # NOTE: Current fix for CuPy having no `writeable support` or maintaining flags - if self.constant and isinstance(self._lazy_data[current_device], np.ndarray): - self._lazy_data[current_device].flags.writeable = False - def _is_data_available(self, device: Device) -> bool: return device in self._lazy_data diff --git a/pyop3/device.py b/pyop3/device.py index 2fd0810d7f..7fdf2695ca 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -22,7 +22,7 @@ def device_index(self): return self._device_index @abstractmethod - def asarray(self, arr): + def asarray(self, arr, *, constant=False): pass @abstractmethod @@ -43,13 +43,18 @@ def __init__(self, device_index: int | None = None): self._registered_arrays = set() self._device_index = device_index - def asarray(self, arr): + def asarray(self, arr, *, constant=False): # NOTE: Better logic needed if we switch from just NumPy/CuPy + output = arr if not isinstance(arr, np.ndarray): import cupy as cp - return cp.asnumpy(arr) - - return np.array(arr) + output = cp.asnumpy(arr) + else: + output = np.array(output) + + if constant: + output.flags.writeable = False + return output def zeros_like(self, arr): return np.zeros_like(arr) @@ -70,7 +75,7 @@ def __init__(self, device_index: int | None = None): # TODO: Raise No GPU exception raise NotImplementedError - def asarray(self, arr): + def asarray(self, arr, *, constant=False): import cupy as cp return cp.asarray(arr) From 39d0acd084f81b27f74ca4eee52319e263a1d310 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Mon, 4 May 2026 14:56:21 +0100 Subject: [PATCH 21/36] pr review: removing unused variables --- pyop3/device.py | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/pyop3/device.py b/pyop3/device.py index 7fdf2695ca..64beef918d 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -7,20 +7,11 @@ import numpy as np class Device(metaclass=ABCMeta): - _name: str - _device_index: int | None + name: str - def __init__(self, device_index: int | None = None): + def __init__(self): pass - @property - def name(self): - return self._name - - @property - def device_index(self): - return self._device_index - @abstractmethod def asarray(self, arr, *, constant=False): pass @@ -30,18 +21,16 @@ def zeros_like(self, arr): pass def __repr__(self): - return self._name + return self.name def __str__(self): - return self._name + return self.name class CPU(Device): + name = "CPU" - def __init__(self, device_index: int | None = None): + def __init__(self): super().__init__() - self._name = "cpu" - self._registered_arrays = set() - self._device_index = device_index def asarray(self, arr, *, constant=False): # NOTE: Better logic needed if we switch from just NumPy/CuPy @@ -60,13 +49,10 @@ def zeros_like(self, arr): return np.zeros_like(arr) class CUDAGPU(Device): + name = "CudaGPU" - def __init__(self, device_index: int | None = None): + def __init__(self): super().__init__() - self._name = "CudaGPU" - self._registered_arrays = set() - self._token = None - self._device_index = device_index try: import cupy as cp @@ -83,6 +69,9 @@ def zeros_like(self, arr): import cupy as cp return cp.zeros_like(arr) +HOST_DEVICE = CPU() +_current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE) + @contextlib.contextmanager def offloading(dev: Device): # TODO: Not Device exception @@ -95,11 +84,5 @@ def offloading(dev: Device): finally: _current_device.reset(token) -# NOTE: Should this const variable be here? -HOST_DEVICE = CPU() - -# NOTE: Use contextvars to act as a bridge between buffer and manager -_current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE) - def get_current_device(): return _current_device.get() From e48d69893d6ccb04f0d224e0f5aa5d33b348734d Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Mon, 4 May 2026 15:07:13 +0100 Subject: [PATCH 22/36] remove dispatch to allow no-import cupy --- pyop3/utils.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/pyop3/utils.py b/pyop3/utils.py index 7f991416af..b8008763af 100644 --- a/pyop3/utils.py +++ b/pyop3/utils.py @@ -337,22 +337,11 @@ def map_when(func, when_func, iterable): else: yield item - -@functools.singledispatch -def readonly(array: Any) -> Any: - raise UnsupportedArrayException - -@readonly.register -def _(array: np.ndarray) -> np.ndarray: - """Return a readonly view of a numpy array.""" - view = array.view() - view.setflags(write=False) - return view - -@readonly.register -def _(array: cp.ndarray) -> cp.ndarray: - """ Return a view of a CuPy array.""" +def readonly(array: np.ndarray | cp.ndarray) -> np.ndarray | cp.ndarray: + """Return a readonly view of a numpy/cupy array.""" view = array.view() + if isinstance(array, np.ndarray): + view.setflags(write=False) return view def debug_assert(predicate, msg=None): From 7d582d6c55ad8b68b98e14880c8a16b721b0c4fe Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Mon, 4 May 2026 15:17:29 +0100 Subject: [PATCH 23/36] pr: fix property, duplicate, init - using @property for last_updated_device, known by state, does not need to be variable. - duplicate method only copies most up-to-date copy and non-copy duplicate only copies for current device - initialisation adds None as optional for data input --- pyop3/buffer.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 053f739fa1..dc672312bc 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -242,11 +242,8 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer): # TODO: Connor and I both dislike defaultdict but I can't think of an alternative atm _state: collections.defaultdict[Device, int] - _last_updated_device: Device - _max_value: np.number | None = None - # flags for tracking parallel correctness _leaves_valid: bool = True _pending_reduction: Callable | None = None @@ -257,7 +254,10 @@ def instruction_executor_cache_key(self, buffer_counter: Mapping[AbstractBuffer, type(self), self._constant, self._rank_equal, self._ordered, self.dtype, buffer_counter[self]) - def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False): + def __init__(self, data: np.ndarray | cp.ndarray | None, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False): + + data = data.flatten() + curr_dev = get_current_device() if sf is None: sf = NullStarForest(data.size) @@ -268,20 +268,13 @@ def __init__(self, data: np.ndarray | cp.ndarray, sf: StarForest | None = None, if rank_equal and not constant: raise ValueError - curr_dev = get_current_device() - self.sf = sf self._name = name self._constant = constant self._rank_equal = rank_equal self._max_value = max_value self._ordered = ordered - self._last_updated_device = curr_dev - - data = data.flatten() - data_mapping = {curr_dev: curr_dev.asarray(data, constant=self._constant)} - self._lazy_data = data_mapping - + self._lazy_data = {curr_dev: curr_dev.asarray(data, constant=self._constant)} self._state = collections.defaultdict(lambda: -1, [(curr_dev, 0)]) self.__post_init__() @@ -320,19 +313,23 @@ def size(self) -> int: def dtype(self) -> np.dtype: return self._data.dtype + @property + def _last_updated_device(self) -> Device: + return max(self.state, key=self.state.get) + def inc_state(self) -> None: curr_dev = get_current_device() self._state[curr_dev] += 1 - self._last_updated_device = curr_dev def duplicate(self, *, copy: bool = False) -> ArrayBuffer: # make sure that there are no pending transfers before we copy self.assemble() name = f"{self.name}_copy" + curr_dev = get_current_device() if copy: - data = {obj: arr.copy() for obj, arr in self._lazy_data.items()} + data = {curr_dev: self._lazy_data[curr_dev]} else: - data = {obj: obj.zeros_like(arr) for obj, arr in self._lazy_data.items()} + data = {curr_dev: curr_dev.zeros_like(self._lazy_data[curr_dev])} return self.__record_init__(_name=name, _lazy_data=data) is_nested: ClassVar[bool] = False @@ -493,8 +490,6 @@ def _data(self): # if self._lazy_data is None: # self._lazy_data = np.zeros(self.shape, dtype=self.dtype) - if self.name == "array_247_buffer": - breakpoint() return self._lazy_data[curr_dev] # TODO: I think the halo bits should only be handled at the Dat level via the From 0528e763bcc1b180bed739f840aac61033892078 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Mon, 4 May 2026 15:48:47 +0100 Subject: [PATCH 24/36] fix: change petsc config version to v3.25.0 - v3.24.5 was failing to compile petsc4py due to PETSc no support for PCPatchSetComputeFunctionExteriorFacets --- pyproject.toml | 2 +- scripts/firedrake-configure | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1e9153c5d5..7b4d9ea1e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -160,7 +160,7 @@ requires = [ "mpi4py; python_version < '3.13'", "numpy", # TODO RELEASE - # "petsc4py==3.24.5", + # "petsc4py==3.25.0", "petsctools", "pkgconfig", "pybind11", diff --git a/scripts/firedrake-configure b/scripts/firedrake-configure index 0c1030808d..2199dbaa87 100755 --- a/scripts/firedrake-configure +++ b/scripts/firedrake-configure @@ -39,7 +39,7 @@ ARCH_DEFAULT = FiredrakeArch.DEFAULT ARCH_COMPLEX = FiredrakeArch.COMPLEX -SUPPORTED_PETSC_VERSION = "v3.24.5" +SUPPORTED_PETSC_VERSION = "v3.25.0" def main(): From e07e6c619060a8eb5c12db97efdf34dbdd0dd22c Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Wed, 6 May 2026 11:28:41 +0100 Subject: [PATCH 25/36] include cupy callable pointer --- pyop3/insn/exec.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pyop3/insn/exec.py b/pyop3/insn/exec.py index e7193dadb1..cf570ec4c5 100644 --- a/pyop3/insn/exec.py +++ b/pyop3/insn/exec.py @@ -515,6 +515,14 @@ def _(self, handle: int): # assumes an address def _(self, handle: np.ndarray) -> int: return handle.ctypes.data + try: + import cupy as cp + @_as_exec_argument.register + def _(self, handle: cp.ndarray) -> int: + return handle.data.ptr + except ImportError: + pass + @_as_exec_argument.register def _(self, mat: PETSc.Mat) -> int: # Sometime the matrix is in an invalid state and we cannot return a handle. From 0aea6d0777dd0d50d99f0a717c1c0a666e4439ea Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Wed, 6 May 2026 11:54:46 +0100 Subject: [PATCH 26/36] maintain CPU SF comms and cond exec with cupy - Wrappped all SF comms with on_host decorator to ensure happens on host - This increases number of copies but all MPI happening on host atm - Executable requires receiving a pointer to array. Providing a conditional singledispatch register - in case user does not have cupy. --- pyop3/buffer.py | 11 ++++------- pyop3/device.py | 11 +++++++++++ pyop3/insn/exec.py | 2 +- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index dc672312bc..163d230867 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -22,10 +22,8 @@ from pyop3.utils import UniqueNameGenerator, as_tuple, deprecated, maybe_generate_name, readonly from pyop3.device import ( Device, - CUDAGPU, - CPU, - HOST_DEVICE, - get_current_device + get_current_device, + on_host ) from ._buffer_cy import set_petsc_mat_diagonal @@ -224,9 +222,6 @@ def handle(self, *, nest_indices: tuple[tuple[int, ...], ...] = ()) -> Any: """The underlying data structure.""" - -# NOTE: When GPU support is added, the host-device awareness and -# copies should live in this class. @pyop3.record.record() class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer): """A buffer whose underlying data structure is a lazily-evaluated NumPy/CuPy array.""" @@ -520,6 +515,7 @@ def _reduction_ops(self): } @not_in_flight + @on_host def reduce_leaves_to_roots(self): self.reduce_leaves_to_roots_begin() self.reduce_leaves_to_roots_end() @@ -548,6 +544,7 @@ def reduce_leaves_to_roots_end(self): self._finalizer = None @not_in_flight + @on_host def broadcast_roots_to_leaves(self): self.broadcast_roots_to_leaves_begin() self.broadcast_roots_to_leaves_end() diff --git a/pyop3/device.py b/pyop3/device.py index 64beef918d..d4b78b5879 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -84,5 +84,16 @@ def offloading(dev: Device): finally: _current_device.reset(token) +def on_host(func): + + def wrapper(*args, **kwargs): + token = _current_device.set(HOST_DEVICE) + try: + return func(*args, **kwargs) + finally: + _current_device.reset(token) + + return wrapper + def get_current_device(): return _current_device.get() diff --git a/pyop3/insn/exec.py b/pyop3/insn/exec.py index cf570ec4c5..64ef1d1d93 100644 --- a/pyop3/insn/exec.py +++ b/pyop3/insn/exec.py @@ -517,7 +517,7 @@ def _(self, handle: np.ndarray) -> int: try: import cupy as cp - @_as_exec_argument.register + @_as_exec_argument.register(cp.ndarray) def _(self, handle: cp.ndarray) -> int: return handle.data.ptr except ImportError: From 6f64d835a7fe4ebc80844f7699c7c9b84492057b Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Wed, 6 May 2026 14:28:42 +0100 Subject: [PATCH 27/36] basic GPU unit tests covering context --- pyop3_gpu_demo.py | 16 ++--- tests/pyop3/unit/test_gpu_context.py | 92 ++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 7 deletions(-) create mode 100644 tests/pyop3/unit/test_gpu_context.py diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py index 027c9a6957..3b3baee6fe 100644 --- a/pyop3_gpu_demo.py +++ b/pyop3_gpu_demo.py @@ -21,6 +21,8 @@ from firedrake import * import pyop3 as op3 +from pyop3.device import on_host + # made up API, we need some way to identify the device host = op3.HOST_DEVICE # or similar @@ -55,18 +57,18 @@ k.dat.buffer.duplicate() k.dat.buffer.duplicate(copy=True) - k.dat.data_wo[...] = 1 + k.dat.assign(1, eager=True, eager_strategy="array") + + k.dat.data_rw[...] = 3 # state tracking checks assert f.dat.buffer.state[host] == 1 # modified once assert f.dat.buffer.state[gpu] == 1 # matches host assert g.dat.buffer.state[host] == 0 # untouched assert g.dat.buffer.state[gpu] == 1 # modified once - assert k.dat.buffer.state[host] == -1 # not created - assert k.dat.buffer.state[gpu] == 1 # modified once + assert k.dat.buffer.state[host] == 2 # created for mpi sharing + assert k.dat.buffer.state[gpu] == 2 # modified -# print(f"{g.dat.buffer._lazy_data=}") -# print(f"{f.dat.buffer._lazy_data=}") assert isinstance(f.dat.data_ro, np.ndarray) assert isinstance(g.dat.data_ro, np.ndarray) assert (g.dat.data_ro == 23).all() @@ -77,5 +79,5 @@ assert f.dat.buffer.state[gpu] == 1 # matches host assert g.dat.buffer.state[host] == 1 # matches device assert g.dat.buffer.state[gpu] == 1 # modified once -assert k.dat.buffer.state[host] == 1 # modified once -assert k.dat.buffer.state[gpu] == 1 # modified once +assert k.dat.buffer.state[host] == 2 # modified twice +assert k.dat.buffer.state[gpu] == 2 # modified twice diff --git a/tests/pyop3/unit/test_gpu_context.py b/tests/pyop3/unit/test_gpu_context.py new file mode 100644 index 0000000000..484ad81cd7 --- /dev/null +++ b/tests/pyop3/unit/test_gpu_context.py @@ -0,0 +1,92 @@ +import pytest +import numpy as np + +try: + import cupy as cp +except ImportError as err: + pytest.exit("CuPy not available, skipping GPU tests...") + + +import pyop3 as op3 +from firedrake import Function, FunctionSpace, UnitSquareMesh + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +HOST = op3.HOST_DEVICE +CUDAGPU = op3.CUDAGPU() + +STATE_NOT_CREATED = -1 +STATE_UNTOUCHED = 0 +STATE_MODIFIED = 1 + +@pytest.fixture(scope="module") +def mesh(): + return UnitSquareMesh(3, 3) + +@pytest.fixture(scope="module") +def function_space(mesh): + return FunctionSpace(mesh, "P", 2) + +@pytest.fixture(scope="module") +def f(function_space): + return Function(function_space) + +@pytest.fixture(scope="module") +def g(function_space): + return Function(function_space) + +def state(func, device): + """Shorthand for reading buffer state on a given device.""" + return func.dat.buffer.state[device] + +class TestInitialState: + def test_host_data_is_numpy(self, f): + assert isinstance(f.dat.data_ro, np.ndarray) + + def test_host_state_modified(self, f): + """Assign affects buffer counter on host""" + f.dat.assign(10, eager=True, eager_strategy="array") + assert state(f, HOST) == 1 + + def test_gpu_state_not_created(self, f): + """CUDAGPU buffer should not exist before any offloading.""" + assert state(f, CUDAGPU) == STATE_NOT_CREATED + +class TestOffloadingArrayTypes: + """Inside op3.offloading, data array type should be GPU array types""" + + def test_buffer_evaluates_cupy_on_cudagpu(self): + mesh = UnitSquareMesh(3, 3) + V = FunctionSpace(mesh, "P", 2) + + f = Function(V).assign(10) + g = Function(V) + with op3.offloading(CUDAGPU): + assert not isinstance(f.dat.data_ro, np.ndarray) + + def test_buffer_creation_on_cudagpu(self): + mesh = UnitSquareMesh(3, 3) + V = FunctionSpace(mesh, "P", 2) + + f = Function(V).assign(10) + g = Function(V) + with op3.offloading(CUDAGPU): + k = Function(V) + assert not isinstance(k.dat.data_ro, np.ndarray) + +class TestOffloadingAssignmentState: + + @pytest.fixture(autouse=True) + def _run_gpu_assign(self, f, g): + with op3.offloading(CUDAGPU): + g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array") + + def test_host_state_untouched_after_gpu_assign(self, g): + """g was not modified on host""" + assert state(g, HOST) == 0 + + def test_g_gpu_state_modified_after_assign(self, g): + assert state(g, CUDAGPU) == 1 From c8db3e20e5e65c85293e0c00b189e452a088c4da Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Wed, 6 May 2026 18:07:27 +0100 Subject: [PATCH 28/36] test cases for gpu - no fixtures due to bug --- tests/pyop3/unit/test_gpu_context.py | 71 ++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 20 deletions(-) diff --git a/tests/pyop3/unit/test_gpu_context.py b/tests/pyop3/unit/test_gpu_context.py index 484ad81cd7..69ab4f9752 100644 --- a/tests/pyop3/unit/test_gpu_context.py +++ b/tests/pyop3/unit/test_gpu_context.py @@ -11,10 +11,6 @@ from firedrake import Function, FunctionSpace, UnitSquareMesh -# --------------------------------------------------------------------------- -# Fixtures -# --------------------------------------------------------------------------- - HOST = op3.HOST_DEVICE CUDAGPU = op3.CUDAGPU() @@ -22,21 +18,21 @@ STATE_UNTOUCHED = 0 STATE_MODIFIED = 1 -@pytest.fixture(scope="module") +@pytest.fixture() def mesh(): return UnitSquareMesh(3, 3) -@pytest.fixture(scope="module") -def function_space(mesh): +@pytest.fixture() +def V(mesh): return FunctionSpace(mesh, "P", 2) -@pytest.fixture(scope="module") -def f(function_space): - return Function(function_space) +@pytest.fixture() +def f(V): + return Function(V) -@pytest.fixture(scope="module") -def g(function_space): - return Function(function_space) +@pytest.fixture() +def g(V): + return Function(V) def state(func, device): """Shorthand for reading buffer state on a given device.""" @@ -55,6 +51,8 @@ def test_gpu_state_not_created(self, f): """CUDAGPU buffer should not exist before any offloading.""" assert state(f, CUDAGPU) == STATE_NOT_CREATED +# NOTE: `pytest.fixture`s not used for Offloading GPU tests due to segfault +# Unsure what is causing but we are leaving for now. class TestOffloadingArrayTypes: """Inside op3.offloading, data array type should be GPU array types""" @@ -78,15 +76,48 @@ def test_buffer_creation_on_cudagpu(self): assert not isinstance(k.dat.data_ro, np.ndarray) class TestOffloadingAssignmentState: - - @pytest.fixture(autouse=True) - def _run_gpu_assign(self, f, g): - with op3.offloading(CUDAGPU): - g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array") - def test_host_state_untouched_after_gpu_assign(self, g): + def test_host_state_untouched_after_gpu_assign(self): """g was not modified on host""" + mesh = UnitSquareMesh(3, 3) + V = FunctionSpace(mesh, "P", 2) + + f = Function(V).assign(10) + g = Function(V) + with op3.offloading(CUDAGPU): + g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array") assert state(g, HOST) == 0 - def test_g_gpu_state_modified_after_assign(self, g): + def test_gpu_state_modified_after_assign(self): + mesh = UnitSquareMesh(3, 3) + V = FunctionSpace(mesh, "P", 2) + + f = Function(V).assign(10) + g = Function(V) + with op3.offloading(CUDAGPU): + g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array") assert state(g, CUDAGPU) == 1 + +class TestOffloadingArraysUpdated: + + def test_gpu_array_modified(self): + '''Data on GPU is updated in GPU context''' + mesh = UnitSquareMesh(3, 3) + V = FunctionSpace(mesh, "P", 2) + + f = Function(V).assign(10) + g = Function(V) + with op3.offloading(CUDAGPU): + g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array") + assert (g.dat.data_ro == 23).all() + + def test_gpu_array_modified(self): + ''' Data on CPU is updated when in CPU context''' + mesh = UnitSquareMesh(3, 3) + V = FunctionSpace(mesh, "P", 2) + + f = Function(V).assign(10) + g = Function(V) + with op3.offloading(CUDAGPU): + g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array") + assert (g.dat.data_ro == 23).all() From 7aa8425d9c096f9b70ed3e6cfa0b1611803acc2c Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Wed, 6 May 2026 18:08:37 +0100 Subject: [PATCH 29/36] fixed bug by amending record_modified state bug regarding: if an array is modified as the first action when offloaded: 1. state for new device is updated and incremented (as modifying) 2. then buffer attempts to sync from most up-to-date device 3. this may be current device as state was incremented first 4. then device tries to copy non-existent array to itself - bad solution: adapt record-modified decorator to increment state after wrapped function - simple try-finally wrapped trick --- pyop3/buffer.py | 8 +++++--- pyop3_gpu_demo.py | 11 ++++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 163d230867..6a89f6f8ad 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -61,8 +61,10 @@ def wrapper(self, *args, **kwargs): def record_modified(func): def wrapper(self, *args, **kwargs): assert not self.constant - self.inc_state() - return func(self, *args, **kwargs) + try: + return func(self, *args, **kwargs) + finally: + self.inc_state() return wrapper @@ -314,7 +316,7 @@ def _last_updated_device(self) -> Device: def inc_state(self) -> None: curr_dev = get_current_device() - self._state[curr_dev] += 1 + self.state[curr_dev] = self.state.get(curr_dev, 0) + 1 def duplicate(self, *, copy: bool = False) -> ArrayBuffer: # make sure that there are no pending transfers before we copy diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py index 3b3baee6fe..6414718467 100644 --- a/pyop3_gpu_demo.py +++ b/pyop3_gpu_demo.py @@ -57,8 +57,6 @@ k.dat.buffer.duplicate() k.dat.buffer.duplicate(copy=True) - k.dat.assign(1, eager=True, eager_strategy="array") - k.dat.data_rw[...] = 3 # state tracking checks @@ -66,18 +64,17 @@ assert f.dat.buffer.state[gpu] == 1 # matches host assert g.dat.buffer.state[host] == 0 # untouched assert g.dat.buffer.state[gpu] == 1 # modified once - assert k.dat.buffer.state[host] == 2 # created for mpi sharing - assert k.dat.buffer.state[gpu] == 2 # modified + assert k.dat.buffer.state[host] == -1 # not created + assert k.dat.buffer.state[gpu] == 1 # modified assert isinstance(f.dat.data_ro, np.ndarray) assert isinstance(g.dat.data_ro, np.ndarray) assert (g.dat.data_ro == 23).all() -assert (k.dat.data_ro == 1).all() # state tracking checks assert f.dat.buffer.state[host] == 1 # modified once assert f.dat.buffer.state[gpu] == 1 # matches host assert g.dat.buffer.state[host] == 1 # matches device assert g.dat.buffer.state[gpu] == 1 # modified once -assert k.dat.buffer.state[host] == 2 # modified twice -assert k.dat.buffer.state[gpu] == 2 # modified twice +assert k.dat.buffer.state[host] == -1 # not created +assert k.dat.buffer.state[gpu] == 1 # modified once From 5be4a8c549a15f3c4a39d8703c2399f73b8ff66b Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Tue, 12 May 2026 13:06:11 +0100 Subject: [PATCH 30/36] tests and fix: cached properties on gpu basic unit tests introduced for gpu context as ground truth bug: if AxisTree for FunctionSpace (or sim. object) not cached until on device, it attempts to `compile` - this is not implemented for GPU yet. fix: this has been fixed with some patches for now, to be removed. - `firedrake/functionspaceimpl::make_dat` is wrapped in on_host - `pyop3/tree/axis_tree/tree.py::_buffer_indices` is wrapped in on_host - `pyop3/buffer.py` has necessary sync in duplicate (in scenario that user assigns and immediately copies) With current fix, this does mean that if the AxisTree properties are not cached, the assign operation will happen on CPU - consequent calls work on GPU. --- firedrake/functionspaceimpl.py | 3 + pyop3/buffer.py | 9 ++- pyop3/insn/exec.py | 4 +- pyop3/tree/axis_tree/tree.py | 5 ++ pyop3_gpu_demo.py | 11 ++-- tests/pyop3/unit/test_gpu_context.py | 98 ++++++++++++++-------------- 6 files changed, 74 insertions(+), 56 deletions(-) diff --git a/firedrake/functionspaceimpl.py b/firedrake/functionspaceimpl.py index 53fd9308ee..1fe2451417 100644 --- a/firedrake/functionspaceimpl.py +++ b/firedrake/functionspaceimpl.py @@ -31,6 +31,7 @@ from pyop3 import mpi from pyop3.utils import just_one, single_valued from pyop3.cache import cached_on, with_heavy_caches, cached_method +from pyop3.device import on_host from finat.quadrature import QuadratureRule from ufl.cell import CellSequence @@ -1412,7 +1413,9 @@ def dim(self): See also :attr:`FunctionSpace.dof_count` and :attr:`FunctionSpace.node_count` .""" return self.template_vec.getSize() + # TODO: `on_host` decorator only exists while `compile` strategy does not work on device @_with_mesh_heavy_cache + @on_host def make_dat(self, val=None, valuetype=None, name=None): """Return a new Dat storing DoFs for the function space.""" if val is not None: diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 6a89f6f8ad..18f39283cb 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -323,6 +323,12 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer: self.assemble() name = f"{self.name}_copy" curr_dev = get_current_device() + + # TODO: Fix for first-assign, immediate duplicate bug + # This can be removed once `compile` strategy works on device + if curr_dev not in self._lazy_data: + self.sync_devices(curr_dev) + if copy: data = {curr_dev: self._lazy_data[curr_dev]} else: @@ -332,7 +338,7 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer: is_nested: ClassVar[bool] = False @property - def handle(self) -> np.ndarray: + def handle(self) -> np.ndarray | cp.ndarray: return self._data @property @@ -486,7 +492,6 @@ def _data(self): # NOTE: If data is None, set to zeros? # if self._lazy_data is None: # self._lazy_data = np.zeros(self.shape, dtype=self.dtype) - return self._lazy_data[curr_dev] # TODO: I think the halo bits should only be handled at the Dat level via the diff --git a/pyop3/insn/exec.py b/pyop3/insn/exec.py index 64ef1d1d93..5514f7b158 100644 --- a/pyop3/insn/exec.py +++ b/pyop3/insn/exec.py @@ -517,9 +517,11 @@ def _(self, handle: np.ndarray) -> int: try: import cupy as cp + # NOTE: This gives a pointer to a GPU memory address. + # Loopy cannot work with GPU so this will lead to a segfault. @_as_exec_argument.register(cp.ndarray) def _(self, handle: cp.ndarray) -> int: - return handle.data.ptr + raise MemoryError("SegFault will occur if you pass a CuPy GPU pointer to Loopy/C code") except ImportError: pass diff --git a/pyop3/tree/axis_tree/tree.py b/pyop3/tree/axis_tree/tree.py index a60dab9d76..5d5781ae8a 100644 --- a/pyop3/tree/axis_tree/tree.py +++ b/pyop3/tree/axis_tree/tree.py @@ -64,6 +64,8 @@ strictly_all, ) +from pyop3.device import on_host + if typing.TYPE_CHECKING: from pyop3.expr import LinearDatBufferExpression @@ -1641,7 +1643,10 @@ def materialize(self): # TODO: how do we know if buffer_slice will produce the same object across all ranks? # Need to make forming a slice or a subset an active decision! + + # TODO: on_host decorator only required while `compile` strategy does not work for device offloading @cached_property + @on_host def _buffer_indices(self) -> np.ndarray[IntType]: from pyop3 import Dat, do_loop diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py index 6414718467..0c856e509f 100644 --- a/pyop3_gpu_demo.py +++ b/pyop3_gpu_demo.py @@ -48,11 +48,11 @@ assert not isinstance(f.dat.data_ro, np.ndarray) assert not isinstance(g.dat.data_ro, np.ndarray) - # Do the assignment using array operations + # # Do the assignment using array operations g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="array") - # Do the assignment using MLIR (this is a later step) - # g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="compile") + # # Do the assignment using MLIR (this is a later step) + # # g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="compile") k = Function(V) k.dat.buffer.duplicate() k.dat.buffer.duplicate(copy=True) @@ -64,17 +64,18 @@ assert f.dat.buffer.state[gpu] == 1 # matches host assert g.dat.buffer.state[host] == 0 # untouched assert g.dat.buffer.state[gpu] == 1 # modified once - assert k.dat.buffer.state[host] == -1 # not created + assert k.dat.buffer.state[host] == 0 # not modified assert k.dat.buffer.state[gpu] == 1 # modified assert isinstance(f.dat.data_ro, np.ndarray) assert isinstance(g.dat.data_ro, np.ndarray) assert (g.dat.data_ro == 23).all() +assert (k.dat.data_ro == 3).all() # state tracking checks assert f.dat.buffer.state[host] == 1 # modified once assert f.dat.buffer.state[gpu] == 1 # matches host assert g.dat.buffer.state[host] == 1 # matches device assert g.dat.buffer.state[gpu] == 1 # modified once -assert k.dat.buffer.state[host] == -1 # not created +assert k.dat.buffer.state[host] == 1 # matches device assert k.dat.buffer.state[gpu] == 1 # modified once diff --git a/tests/pyop3/unit/test_gpu_context.py b/tests/pyop3/unit/test_gpu_context.py index 69ab4f9752..1f59580d07 100644 --- a/tests/pyop3/unit/test_gpu_context.py +++ b/tests/pyop3/unit/test_gpu_context.py @@ -23,101 +23,103 @@ def mesh(): return UnitSquareMesh(3, 3) @pytest.fixture() -def V(mesh): +def FuncSpace(mesh): return FunctionSpace(mesh, "P", 2) @pytest.fixture() -def f(V): - return Function(V) +def f(FuncSpace): + return Function(FuncSpace).assign(10) @pytest.fixture() -def g(V): - return Function(V) +def g(FuncSpace): + return Function(FuncSpace) def state(func, device): """Shorthand for reading buffer state on a given device.""" return func.dat.buffer.state[device] + class TestInitialState: def test_host_data_is_numpy(self, f): assert isinstance(f.dat.data_ro, np.ndarray) def test_host_state_modified(self, f): """Assign affects buffer counter on host""" + old_state = state(f, HOST) f.dat.assign(10, eager=True, eager_strategy="array") - assert state(f, HOST) == 1 + assert state(f, HOST) == old_state + 1 def test_gpu_state_not_created(self, f): """CUDAGPU buffer should not exist before any offloading.""" assert state(f, CUDAGPU) == STATE_NOT_CREATED -# NOTE: `pytest.fixture`s not used for Offloading GPU tests due to segfault -# Unsure what is causing but we are leaving for now. class TestOffloadingArrayTypes: """Inside op3.offloading, data array type should be GPU array types""" - def test_buffer_evaluates_cupy_on_cudagpu(self): - mesh = UnitSquareMesh(3, 3) - V = FunctionSpace(mesh, "P", 2) - - f = Function(V).assign(10) - g = Function(V) + def test_buffer_evaluates_cupy_on_cudagpu(self, FuncSpace): + f = Function(FuncSpace).assign(10) with op3.offloading(CUDAGPU): - assert not isinstance(f.dat.data_ro, np.ndarray) - - def test_buffer_creation_on_cudagpu(self): - mesh = UnitSquareMesh(3, 3) - V = FunctionSpace(mesh, "P", 2) + assert isinstance(f.dat.data_ro, cp.ndarray) - f = Function(V).assign(10) - g = Function(V) + def test_buffer_creation_on_cudagpu(self, FuncSpace): with op3.offloading(CUDAGPU): - k = Function(V) - assert not isinstance(k.dat.data_ro, np.ndarray) + k = Function(FuncSpace) + assert isinstance(k.dat.data_ro, cp.ndarray) class TestOffloadingAssignmentState: - def test_host_state_untouched_after_gpu_assign(self): + def test_host_state_untouched_after_gpu_assign(self, f, g): """g was not modified on host""" - mesh = UnitSquareMesh(3, 3) - V = FunctionSpace(mesh, "P", 2) - - f = Function(V).assign(10) - g = Function(V) with op3.offloading(CUDAGPU): g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array") assert state(g, HOST) == 0 - def test_gpu_state_modified_after_assign(self): - mesh = UnitSquareMesh(3, 3) - V = FunctionSpace(mesh, "P", 2) - - f = Function(V).assign(10) - g = Function(V) + def test_gpu_state_modified_after_assign(self, f, g): + """g was modified on CUDAGPU""" with op3.offloading(CUDAGPU): g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array") assert state(g, CUDAGPU) == 1 class TestOffloadingArraysUpdated: - def test_gpu_array_modified(self): + def test_gpu_array_modified(self, g): '''Data on GPU is updated in GPU context''' - mesh = UnitSquareMesh(3, 3) - V = FunctionSpace(mesh, "P", 2) - - f = Function(V).assign(10) - g = Function(V) with op3.offloading(CUDAGPU): - g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array") + g.dat.assign(23, eager=True, eager_strategy="array") assert (g.dat.data_ro == 23).all() - def test_gpu_array_modified(self): + def test_gpu_array_modified_copied_to_host(self, g): ''' Data on CPU is updated when in CPU context''' - mesh = UnitSquareMesh(3, 3) - V = FunctionSpace(mesh, "P", 2) + with op3.offloading(CUDAGPU): + g.dat.assign(23, eager=True, eager_strategy="array") + assert (g.dat.data_ro == 23).all() - f = Function(V).assign(10) - g = Function(V) + def test_gpu_data_wo_copied_to_host(self, g): + ''' Data on CPU is updated when in CPU context''' with op3.offloading(CUDAGPU): - g.dat.assign(2 * f.dat + 3, eager=True, eager_strategy="array") + g.dat.data_wo[...] = 23 assert (g.dat.data_ro == 23).all() + +class TestDeviceArrayDuplication: + + def test_duplicate_not_same(self, FuncSpace): + """Duplicate buffer is not same object""" + with op3.offloading(CUDAGPU): + k = Function(FuncSpace) + k_dup_buffer = k.dat.buffer.duplicate() + assert type(k_dup_buffer) == type(k.dat.buffer) + assert not k_dup_buffer is k.dat.buffer + + def test_duplicate_to_device(self, FuncSpace): + """ Buffer maintains device context when copied""" + with op3.offloading(CUDAGPU): + k = Function(FuncSpace) + k_dup_buffer = k.dat.buffer.duplicate() + assert isinstance(k_dup_buffer.data_ro, cp.ndarray) + + def test_duplicate_copy_to_device(self, FuncSpace): + """ Buffer maintains device context when exact copy""" + with op3.offloading(CUDAGPU): + k = Function(FuncSpace) + k_dup_buffer = k.dat.buffer.duplicate(copy=True) + assert isinstance(k_dup_buffer.data_ro, cp.ndarray) From f13342a61f2803143bc3e1a45ae7865a59d32296 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Tue, 12 May 2026 13:27:04 +0100 Subject: [PATCH 31/36] cleaning comments --- pyop3/buffer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index 18f39283cb..f1953da0d0 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -237,7 +237,6 @@ class ArrayBuffer(AbstractArrayBuffer, ConcreteBuffer): _rank_equal: bool _ordered: bool - # TODO: Connor and I both dislike defaultdict but I can't think of an alternative atm _state: collections.defaultdict[Device, int] _max_value: np.number | None = None @@ -489,9 +488,6 @@ def _data(self): if not self._is_data_available(curr_dev) or not self._is_data_synced(curr_dev): self.sync_devices(curr_dev) - # NOTE: If data is None, set to zeros? - # if self._lazy_data is None: - # self._lazy_data = np.zeros(self.shape, dtype=self.dtype) return self._lazy_data[curr_dev] # TODO: I think the halo bits should only be handled at the Dat level via the From 5fbbb8dd030b8ad3e023f91943b9c0c7e65f8d24 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Tue, 12 May 2026 16:23:09 +0100 Subject: [PATCH 32/36] pr: resolving comments and adding docstrings --- pyop3/buffer.py | 28 ++++++++++++------ pyop3/device.py | 76 +++++++++++++++++++++++++++++++++---------------- pyop3/utils.py | 4 +-- 3 files changed, 72 insertions(+), 36 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index f1953da0d0..d53833d4f3 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -250,7 +250,16 @@ def instruction_executor_cache_key(self, buffer_counter: Mapping[AbstractBuffer, type(self), self._constant, self._rank_equal, self._ordered, self.dtype, buffer_counter[self]) - def __init__(self, data: np.ndarray | cp.ndarray | None, sf: StarForest | None = None, *, name: str|None=None,prefix:str|None=None,constant:bool=False, rank_equal: bool = False, max_value: numbers.Number | None=None, ordered:bool=False): + def __init__( + self, data: np.ndarray | cp.ndarray | None, + sf: StarForest | None = None, *, + name: str|None=None, + prefix:str|None=None, + constant:bool=False, + rank_equal: bool = False, + max_value: numbers.Number | None=None, + ordered:bool=False + ): data = data.flatten() curr_dev = get_current_device() @@ -326,7 +335,7 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer: # TODO: Fix for first-assign, immediate duplicate bug # This can be removed once `compile` strategy works on device if curr_dev not in self._lazy_data: - self.sync_devices(curr_dev) + self.sync_devices() if copy: data = {curr_dev: self._lazy_data[curr_dev]} @@ -485,8 +494,8 @@ def leaves_valid(self) -> bool: def _data(self): curr_dev = get_current_device() - if not self._is_data_available(curr_dev) or not self._is_data_synced(curr_dev): - self.sync_devices(curr_dev) + if not self._is_data_available_and_synced(curr_dev): + self.sync_devices() return self._lazy_data[curr_dev] @@ -604,17 +613,18 @@ def localize(self) -> ArrayBuffer: def _localized(self) -> ArrayBuffer: return self.__record_init__(sf=None) - def sync_devices(self, current_device: Device): + def sync_devices(self): last_updated_device = self._last_updated_device + current_device = get_current_device() self._lazy_data[current_device] = current_device.asarray(self._lazy_data[last_updated_device], constant=self.constant) self._state[current_device] = self._state[last_updated_device] - def _is_data_available(self, device: Device) -> bool: - return device in self._lazy_data + def _is_data_available_and_synced(self, device: Device) -> bool: + is_available = device in self._lazy_data + is_synced = self.state[device] == max(self.state.values()) + return is_available and is_synced - def _is_data_synced(self, device: Device) -> bool: - return self.state[device] == max(self.state.values()) class MatBufferSpec(abc.ABC): pass diff --git a/pyop3/device.py b/pyop3/device.py index d4b78b5879..c0bf3703af 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -7,11 +7,13 @@ import numpy as np class Device(metaclass=ABCMeta): + """ + Device - Abstract class + - Base for future GPU implementations + - All device-specific logic should be kept in here + """ name: str - def __init__(self): - pass - @abstractmethod def asarray(self, arr, *, constant=False): pass @@ -27,53 +29,77 @@ def __str__(self): return self.name class CPU(Device): + """ + CPU Class, designed to be host object. + - Plausible to have multiple CPUs, functionally similar to having GPU + """ name = "CPU" - def __init__(self): - super().__init__() - def asarray(self, arr, *, constant=False): - # NOTE: Better logic needed if we switch from just NumPy/CuPy - output = arr - if not isinstance(arr, np.ndarray): + """ Convert GPU/CuPy/NumPy input array to CPU-compliant NumPy array """ + try: import cupy as cp + except ImportError: + cp = None + + if cp and isinstance(arr, cp.ndarray): output = cp.asnumpy(arr) + elif isinstance(arr, np.ndarray): + output = np.array(arr) + if constant: + output.flags.writeable = False else: - output = np.array(output) + raise TypeError(f"{type(arr)} not supported.") - if constant: - output.flags.writeable = False return output + def zeros_like(self, arr): return np.zeros_like(arr) class CUDAGPU(Device): + """ + GPU class for Nvidia GPUs + - All offloading will be done through CuPy + - Multiple instantiations will be independent of each other + """ name = "CudaGPU" def __init__(self): - super().__init__() - try: - import cupy as cp - assert cp.is_available() + assert self.cp.is_available() except: # TODO: Raise No GPU exception raise NotImplementedError - def asarray(self, arr, *, constant=False): + @property + def cp(self): import cupy as cp - return cp.asarray(arr) + return cp + + def asarray(self, arr, *, constant=False): + return self.cp.asarray(arr) def zeros_like(self, arr): - import cupy as cp - return cp.zeros_like(arr) + return self.cp.zeros_like(arr) HOST_DEVICE = CPU() + +""" + Global context variable for determining device context + - This should not be imported to other modules - value accessed through getter + - All modification should be controlled via the offloading function. +""" _current_device = contextvars.ContextVar("current_device", default=HOST_DEVICE) @contextlib.contextmanager def offloading(dev: Device): + """ + Context Manager for offloading components to select device + - Controls global _current_device variable + - Stores former device in `token` to allow stacking of context changes + """ + # TODO: Not Device exception if not isinstance(dev, Device): raise NotImplementedError @@ -85,13 +111,13 @@ def offloading(dev: Device): _current_device.reset(token) def on_host(func): - + """ + Decorator for components that we want to stay on host device + i.e. MPI communications/StarForest + """ def wrapper(*args, **kwargs): - token = _current_device.set(HOST_DEVICE) - try: + with offloading(HOST_DEVICE): return func(*args, **kwargs) - finally: - _current_device.reset(token) return wrapper diff --git a/pyop3/utils.py b/pyop3/utils.py index b8008763af..b9276d0c82 100644 --- a/pyop3/utils.py +++ b/pyop3/utils.py @@ -29,7 +29,7 @@ ndarray_types = [np.ndarray,] try: import cupy as cp - ndarray_types = [np.ndarray, cp.ndarray] + ndarray_types.append(cp.ndarray) except ImportError: pass @@ -584,7 +584,7 @@ def pretty_type(obj: Any) -> str: def safe_equals(a, b, /) -> bool: if any(isinstance(x, tuple(ndarray_types)) for x in [a, b]): return (a == b).all() - if any(isinstance(x, dict) for x in [a, b]): + if any(isinstance(x, Mapping) for x in [a, b]): if a.keys() != b.keys(): return False return all(safe_equals(a[k], b[k]) for k in a) From 1e7755f9d8a2eb6fa65fb154c7c461040b98a129 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Tue, 12 May 2026 16:25:50 +0100 Subject: [PATCH 33/36] pr: clean tests and remove gpu demo --- tests/pyop3/unit/test_gpu_context.py | 33 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/tests/pyop3/unit/test_gpu_context.py b/tests/pyop3/unit/test_gpu_context.py index 1f59580d07..51306abad1 100644 --- a/tests/pyop3/unit/test_gpu_context.py +++ b/tests/pyop3/unit/test_gpu_context.py @@ -4,7 +4,7 @@ try: import cupy as cp except ImportError as err: - pytest.exit("CuPy not available, skipping GPU tests...") + pytest.skip(allow_module_level=True, reason="CuPy not available, skipping GPU tests...") import pyop3 as op3 @@ -23,16 +23,16 @@ def mesh(): return UnitSquareMesh(3, 3) @pytest.fixture() -def FuncSpace(mesh): +def space(mesh): return FunctionSpace(mesh, "P", 2) @pytest.fixture() -def f(FuncSpace): - return Function(FuncSpace).assign(10) +def f(space): + return Function(space).assign(10) @pytest.fixture() -def g(FuncSpace): - return Function(FuncSpace) +def g(space): + return Function(space) def state(func, device): """Shorthand for reading buffer state on a given device.""" @@ -56,14 +56,14 @@ def test_gpu_state_not_created(self, f): class TestOffloadingArrayTypes: """Inside op3.offloading, data array type should be GPU array types""" - def test_buffer_evaluates_cupy_on_cudagpu(self, FuncSpace): - f = Function(FuncSpace).assign(10) + def test_buffer_evaluates_cupy_on_cudagpu(self, space): + f = Function(space).assign(10) with op3.offloading(CUDAGPU): assert isinstance(f.dat.data_ro, cp.ndarray) - def test_buffer_creation_on_cudagpu(self, FuncSpace): + def test_buffer_creation_on_cudagpu(self, space): with op3.offloading(CUDAGPU): - k = Function(FuncSpace) + k = Function(space) assert isinstance(k.dat.data_ro, cp.ndarray) class TestOffloadingAssignmentState: @@ -102,24 +102,25 @@ def test_gpu_data_wo_copied_to_host(self, g): class TestDeviceArrayDuplication: - def test_duplicate_not_same(self, FuncSpace): + def test_duplicate_not_same(self, space): """Duplicate buffer is not same object""" with op3.offloading(CUDAGPU): - k = Function(FuncSpace) + k = Function(space) k_dup_buffer = k.dat.buffer.duplicate() assert type(k_dup_buffer) == type(k.dat.buffer) assert not k_dup_buffer is k.dat.buffer - def test_duplicate_to_device(self, FuncSpace): + def test_duplicate_to_device(self, space): """ Buffer maintains device context when copied""" with op3.offloading(CUDAGPU): - k = Function(FuncSpace) + k = Function(space) k_dup_buffer = k.dat.buffer.duplicate() assert isinstance(k_dup_buffer.data_ro, cp.ndarray) - def test_duplicate_copy_to_device(self, FuncSpace): + def test_duplicate_copy_to_device(self, space): """ Buffer maintains device context when exact copy""" with op3.offloading(CUDAGPU): - k = Function(FuncSpace) + k = Function(space) k_dup_buffer = k.dat.buffer.duplicate(copy=True) assert isinstance(k_dup_buffer.data_ro, cp.ndarray) + assert k_dup_buffer._data is k.dat.buffer._data From 891e2c7ff5b2c4fd2429de8a9a1515e6a58281ba Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Tue, 12 May 2026 16:27:48 +0100 Subject: [PATCH 34/36] removing gpu demo --- pyop3_gpu_demo.py | 81 ----------------------------------------------- 1 file changed, 81 deletions(-) delete mode 100644 pyop3_gpu_demo.py diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py deleted file mode 100644 index 0c856e509f..0000000000 --- a/pyop3_gpu_demo.py +++ /dev/null @@ -1,81 +0,0 @@ -""" -Useful links: - - * https://github.com/firedrakeproject/firedrake/blob/main/.github/workflows/core.yml#L476 - - How to build a GPU-enabled Firedrake. - - * https://github.com/firedrakeproject/firedrake/blob/connorjward/pyop3-gpu/pyop3/device.py - - An implementation of the 'device' context manager. It needs a big refactor. - - * https://github.com/OP2/PyOP2/pull/691/changes#diff-f8765d963b5adb1788f453e259d8cd45f29cee9670563ddb99b9fe2bba115a12 - - Using a wrapper type to track changes between host and device. In pyop3 - this would be the 'ArrayBuffer' object and link into existing - state tracking. -""" - -import numpy as np - -from firedrake import * -import pyop3 as op3 - -from pyop3.device import on_host - - -# made up API, we need some way to identify the device -host = op3.HOST_DEVICE # or similar -gpu = op3.CUDAGPU() - -mesh = UnitSquareMesh(3, 3) -V = FunctionSpace(mesh, "P", 2) - -f = Function(V).assign(10) -g = Function(V) - -assert isinstance(f.dat.data_ro, np.ndarray) -assert isinstance(g.dat.data_ro, np.ndarray) - -# state tracking checks, .buffer.state is now device-specific -assert f.dat.buffer.state[host] == 1 # modified once -assert f.dat.buffer.state[gpu] == -1 # not created -assert g.dat.buffer.state[host] == 0 # untouched -assert g.dat.buffer.state[gpu] == -1 # not created - -with op3.offloading(gpu): - # Getting the .data attribute on the GPU should give us back a GPU array type - assert not isinstance(f.dat.data_ro, np.ndarray) - assert not isinstance(g.dat.data_ro, np.ndarray) - - # # Do the assignment using array operations - g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="array") - - # # Do the assignment using MLIR (this is a later step) - # # g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="compile") - k = Function(V) - k.dat.buffer.duplicate() - k.dat.buffer.duplicate(copy=True) - - k.dat.data_rw[...] = 3 - - # state tracking checks - assert f.dat.buffer.state[host] == 1 # modified once - assert f.dat.buffer.state[gpu] == 1 # matches host - assert g.dat.buffer.state[host] == 0 # untouched - assert g.dat.buffer.state[gpu] == 1 # modified once - assert k.dat.buffer.state[host] == 0 # not modified - assert k.dat.buffer.state[gpu] == 1 # modified - -assert isinstance(f.dat.data_ro, np.ndarray) -assert isinstance(g.dat.data_ro, np.ndarray) -assert (g.dat.data_ro == 23).all() -assert (k.dat.data_ro == 3).all() - -# state tracking checks -assert f.dat.buffer.state[host] == 1 # modified once -assert f.dat.buffer.state[gpu] == 1 # matches host -assert g.dat.buffer.state[host] == 1 # matches device -assert g.dat.buffer.state[gpu] == 1 # modified once -assert k.dat.buffer.state[host] == 1 # matches device -assert k.dat.buffer.state[gpu] == 1 # modified once From 5bd5243a8f1fb05a4015eb40d57bfb8429216a7a Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Tue, 12 May 2026 16:35:35 +0100 Subject: [PATCH 35/36] more descriptive docstring --- pyop3/device.py | 18 ++++++++--- pyop3_gpu_demo.py | 81 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 4 deletions(-) create mode 100644 pyop3_gpu_demo.py diff --git a/pyop3/device.py b/pyop3/device.py index c0bf3703af..1308b31ede 100644 --- a/pyop3/device.py +++ b/pyop3/device.py @@ -30,7 +30,7 @@ def __str__(self): class CPU(Device): """ - CPU Class, designed to be host object. + CPU Class, designed to be host object, inheriting Device - Plausible to have multiple CPUs, functionally similar to having GPU """ name = "CPU" @@ -59,7 +59,7 @@ def zeros_like(self, arr): class CUDAGPU(Device): """ - GPU class for Nvidia GPUs + GPU class for Nvidia GPUs. inheriting Device. - All offloading will be done through CuPy - Multiple instantiations will be independent of each other """ @@ -96,8 +96,18 @@ def zeros_like(self, arr): def offloading(dev: Device): """ Context Manager for offloading components to select device - - Controls global _current_device variable - - Stores former device in `token` to allow stacking of context changes + This function should be the only way to modfiy the current device variable + + Updates current context to the given `dev` variable. + Former device is stored in stack, to be restored when finished + - This also allows for stacking of context windows + + Context variables are also async safe. + + --- Example: + gpu = op3.CUDAGPU() + with op3.offloading(gpu): + g.dat.assign(23, eager=True, eager_strategy="array") """ # TODO: Not Device exception diff --git a/pyop3_gpu_demo.py b/pyop3_gpu_demo.py new file mode 100644 index 0000000000..0c856e509f --- /dev/null +++ b/pyop3_gpu_demo.py @@ -0,0 +1,81 @@ +""" +Useful links: + + * https://github.com/firedrakeproject/firedrake/blob/main/.github/workflows/core.yml#L476 + + How to build a GPU-enabled Firedrake. + + * https://github.com/firedrakeproject/firedrake/blob/connorjward/pyop3-gpu/pyop3/device.py + + An implementation of the 'device' context manager. It needs a big refactor. + + * https://github.com/OP2/PyOP2/pull/691/changes#diff-f8765d963b5adb1788f453e259d8cd45f29cee9670563ddb99b9fe2bba115a12 + + Using a wrapper type to track changes between host and device. In pyop3 + this would be the 'ArrayBuffer' object and link into existing + state tracking. +""" + +import numpy as np + +from firedrake import * +import pyop3 as op3 + +from pyop3.device import on_host + + +# made up API, we need some way to identify the device +host = op3.HOST_DEVICE # or similar +gpu = op3.CUDAGPU() + +mesh = UnitSquareMesh(3, 3) +V = FunctionSpace(mesh, "P", 2) + +f = Function(V).assign(10) +g = Function(V) + +assert isinstance(f.dat.data_ro, np.ndarray) +assert isinstance(g.dat.data_ro, np.ndarray) + +# state tracking checks, .buffer.state is now device-specific +assert f.dat.buffer.state[host] == 1 # modified once +assert f.dat.buffer.state[gpu] == -1 # not created +assert g.dat.buffer.state[host] == 0 # untouched +assert g.dat.buffer.state[gpu] == -1 # not created + +with op3.offloading(gpu): + # Getting the .data attribute on the GPU should give us back a GPU array type + assert not isinstance(f.dat.data_ro, np.ndarray) + assert not isinstance(g.dat.data_ro, np.ndarray) + + # # Do the assignment using array operations + g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="array") + + # # Do the assignment using MLIR (this is a later step) + # # g.dat.assign(2*f.dat + 3, eager=True, eager_strategy="compile") + k = Function(V) + k.dat.buffer.duplicate() + k.dat.buffer.duplicate(copy=True) + + k.dat.data_rw[...] = 3 + + # state tracking checks + assert f.dat.buffer.state[host] == 1 # modified once + assert f.dat.buffer.state[gpu] == 1 # matches host + assert g.dat.buffer.state[host] == 0 # untouched + assert g.dat.buffer.state[gpu] == 1 # modified once + assert k.dat.buffer.state[host] == 0 # not modified + assert k.dat.buffer.state[gpu] == 1 # modified + +assert isinstance(f.dat.data_ro, np.ndarray) +assert isinstance(g.dat.data_ro, np.ndarray) +assert (g.dat.data_ro == 23).all() +assert (k.dat.data_ro == 3).all() + +# state tracking checks +assert f.dat.buffer.state[host] == 1 # modified once +assert f.dat.buffer.state[gpu] == 1 # matches host +assert g.dat.buffer.state[host] == 1 # matches device +assert g.dat.buffer.state[gpu] == 1 # modified once +assert k.dat.buffer.state[host] == 1 # matches device +assert k.dat.buffer.state[gpu] == 1 # modified once From 74eb11fcc0fdaf138dbf771acd8d6f2872f83f93 Mon Sep 17 00:00:00 2001 From: SamSJackson Date: Wed, 13 May 2026 10:58:58 +0100 Subject: [PATCH 36/36] pr: changes to remove _data for get_array --- pyop3/buffer.py | 54 ++++++++++++------------- pyop3/expr/visitors/__init__.py | 2 +- pyop3/insn/exec.py | 5 ++- pyop3/tree/axis_tree/visitors/layout.py | 2 +- pyop3/tree/axis_tree/visitors/size.py | 2 +- tests/pyop3/unit/test_gpu_context.py | 2 +- 6 files changed, 33 insertions(+), 34 deletions(-) diff --git a/pyop3/buffer.py b/pyop3/buffer.py index d53833d4f3..37c2996dae 100644 --- a/pyop3/buffer.py +++ b/pyop3/buffer.py @@ -57,17 +57,6 @@ def wrapper(self, *args, **kwargs): return wrapper - -def record_modified(func): - def wrapper(self, *args, **kwargs): - assert not self.constant - try: - return func(self, *args, **kwargs) - finally: - self.inc_state() - return wrapper - - class AbstractBuffer(DistributedObject, metaclass=abc.ABCMeta): DEFAULT_PREFIX = "buffer" @@ -285,13 +274,14 @@ def __init__( self.__post_init__() def __post_init__(self) -> None: + curr_dev = get_current_device() assert self.sf.size == self.size if self.rank_equal: assert self.constant if self.ordered: utils.debug_assert(lambda: utils.is_sorted(self._lazy_data)) - if self.constant and isinstance(self._data, np.ndarray): - assert not self._data.flags.writeable + if self.constant and isinstance(self._lazy_data[curr_dev], np.ndarray): + assert not self._lazy_data[curr_dev].flags.writeable # }}} @@ -312,11 +302,11 @@ def __post_init__(self) -> None: @property def size(self) -> int: - return self._data.size + return self.get_array().size @property def dtype(self) -> np.dtype: - return self._data.dtype + return self.get_array().dtype @property def _last_updated_device(self) -> Device: @@ -347,7 +337,7 @@ def duplicate(self, *, copy: bool = False) -> ArrayBuffer: @property def handle(self) -> np.ndarray | cp.ndarray: - return self._data + return self.get_array() @property def comm(self) -> MPI.Comm: @@ -441,7 +431,6 @@ def data(self): return self.data_rw @property - @record_modified @not_in_flight def data_rw(self): if not self._roots_valid: @@ -451,7 +440,7 @@ def data_rw(self): # modifying owned values invalidates ghosts self._leaves_valid = False - return self._data + return self.get_array("rw") # TODO: It would be good to be able to get data_ro but without updating the halos # The issue with the previous approach is we would only return the owned data. This @@ -464,10 +453,9 @@ def data_ro(self): self.reduce_leaves_to_roots() if not self._leaves_valid: self.broadcast_roots_to_leaves() - return readonly(self._data) + return readonly(self.get_array("ro")) @property - @record_modified @not_in_flight def data_wo(self): """ @@ -480,7 +468,7 @@ def data_wo(self): # pending writes can be dropped self._pending_reduction = None self._leaves_valid = False - return self._data + return self.get_array("wo") @not_in_flight def assemble(self) -> None: @@ -490,13 +478,15 @@ def assemble(self) -> None: def leaves_valid(self) -> bool: return self._leaves_valid - @property - def _data(self): + def get_array(self, intent: Literal["ro", "rw", "wo"] = "ro"): curr_dev = get_current_device() if not self._is_data_available_and_synced(curr_dev): self.sync_devices() + if intent in {"wo", "rw"}: + self.inc_state() + return self._lazy_data[curr_dev] # TODO: I think the halo bits should only be handled at the Dat level via the @@ -534,14 +524,16 @@ def reduce_leaves_to_roots(self): @not_in_flight def reduce_leaves_to_roots_begin(self): + curr_dev = get_current_device() if not self._roots_valid: self.sf.reduce_begin( - self._data, self._reduction_ops[self._pending_reduction] + self._lazy_data[curr_dev], self._reduction_ops[self._pending_reduction] ) self._leaves_valid = False self._finalizer = self.reduce_leaves_to_roots_end def reduce_leaves_to_roots_end(self): + curr_dev = get_current_device() if self._finalizer is None: raise BadOrderingException( "Should not call _reduce_leaves_to_roots_end without first calling " @@ -551,7 +543,7 @@ def reduce_leaves_to_roots_end(self): raise DataTransferInFlightException("Wrong finalizer called") if not self._roots_valid: - self.sf.reduce_end(self._data, self._reduction_ops[self._pending_reduction]) + self.sf.reduce_end(self._lazy_data[curr_dev], self._reduction_ops[self._pending_reduction]) self._pending_reduction = None self._finalizer = None @@ -563,14 +555,16 @@ def broadcast_roots_to_leaves(self): @not_in_flight def broadcast_roots_to_leaves_begin(self): + curr_dev = get_current_device() if not self._roots_valid: raise RuntimeError("Cannot broadcast invalid roots") if not self._leaves_valid: - self.sf.broadcast_begin(self._data, MPI.REPLACE) + self.sf.broadcast_begin(self._lazy_data[curr_dev], MPI.REPLACE) object.__setattr__(self, "_finalizer", self.broadcast_roots_to_leaves_end) def broadcast_roots_to_leaves_end(self): + curr_dev = get_current_device() if self._finalizer is None: raise BadOrderingException( "Should not call _broadcast_roots_to_leaves_end without first " @@ -580,7 +574,7 @@ def broadcast_roots_to_leaves_end(self): raise DataTransferInFlightException("Wrong finalizer called") if not self._leaves_valid: - self.sf.broadcast_end(self._data, MPI.REPLACE) + self.sf.broadcast_end(self._lazy_data[curr_dev], MPI.REPLACE) self._leaves_valid = True self._finalizer = None @@ -617,7 +611,11 @@ def sync_devices(self): last_updated_device = self._last_updated_device current_device = get_current_device() - self._lazy_data[current_device] = current_device.asarray(self._lazy_data[last_updated_device], constant=self.constant) + self._lazy_data[current_device] = current_device.asarray( + self._lazy_data[last_updated_device], + constant=self.constant + ) + self._state[current_device] = self._state[last_updated_device] def _is_data_available_and_synced(self, device: Device) -> bool: diff --git a/pyop3/expr/visitors/__init__.py b/pyop3/expr/visitors/__init__.py index 38d528eb1e..b61ac0a098 100644 --- a/pyop3/expr/visitors/__init__.py +++ b/pyop3/expr/visitors/__init__.py @@ -1112,7 +1112,7 @@ def get_extremum(expr, extremum: Literal["max", "min"]) -> numbers.Number: result.assign(fn(result, expr)), eager=True ) - return utils.just_one(result.buffer._data) + return utils.just_one(result.buffer.get_array()) def max_(a, b, /, *, lazy: bool = False) -> pyop3.expr.Conditional | numbers.Number: diff --git a/pyop3/insn/exec.py b/pyop3/insn/exec.py index 5514f7b158..77533121bb 100644 --- a/pyop3/insn/exec.py +++ b/pyop3/insn/exec.py @@ -473,7 +473,7 @@ def _buffer_str(self, buffer): @_buffer_str.register def _(self, buffer: pyop3.buffer.ArrayBuffer): - return f"({buffer.size})", str(buffer._data) + return f"({buffer.size})", str(buffer.get_array()) @_buffer_str.register def _(self, buffer: pyop3.buffer.PetscMatBuffer) -> str: @@ -610,7 +610,8 @@ def _buffer_exchanges(self, buffer, intent): nil = dtype_limits(buffer.dtype).min def _init_nil(): - buffer._data[buffer.sf.ileaf] = nil + # Not modifying owned values so don't want to update state via intent + buffer.get_array()[buffer.sf.ileaf] = nil reductions.append(_init_nil) diff --git a/pyop3/tree/axis_tree/visitors/layout.py b/pyop3/tree/axis_tree/visitors/layout.py index c9525b5eb0..4383bd59ce 100644 --- a/pyop3/tree/axis_tree/visitors/layout.py +++ b/pyop3/tree/axis_tree/visitors/layout.py @@ -234,7 +234,7 @@ def _compute_layouts(axis_tree: AxisTree) -> idict[ConcretePathT, ExpressionT]: # Lastly 'freeze' the offset dats so they can no longer be modified for _, offset_dat in to_tabulate: object.__setattr__(offset_dat.buffer, "_constant", True) - offset_dat.buffer._data.flags.writeable = False + offset_dat.buffer.get_array().flags.writeable = False return layouts diff --git a/pyop3/tree/axis_tree/visitors/size.py b/pyop3/tree/axis_tree/visitors/size.py index 68dfa34700..70742ec773 100644 --- a/pyop3/tree/axis_tree/visitors/size.py +++ b/pyop3/tree/axis_tree/visitors/size.py @@ -137,7 +137,7 @@ def compute_axis_tree_component_size(axis_tree: AbstractAxisTree, path: PathT, c if component_size_axes is UNIT_AXIS_TREE: # ick way to make sure that if we have sizes wrapped up into Scalars that this # gets passed up - mysize = utils.just_one(component_size.buffer._data) + mysize = utils.just_one(component_size.buffer.get_array()) if not isinstance(subtree_size, numbers.Integral): sbuf = ArrayBuffer.from_scalar(mysize, constant=True) mysize = ScalarBufferExpression(sbuf) diff --git a/tests/pyop3/unit/test_gpu_context.py b/tests/pyop3/unit/test_gpu_context.py index 51306abad1..472185d470 100644 --- a/tests/pyop3/unit/test_gpu_context.py +++ b/tests/pyop3/unit/test_gpu_context.py @@ -123,4 +123,4 @@ def test_duplicate_copy_to_device(self, space): k = Function(space) k_dup_buffer = k.dat.buffer.duplicate(copy=True) assert isinstance(k_dup_buffer.data_ro, cp.ndarray) - assert k_dup_buffer._data is k.dat.buffer._data + assert k_dup_buffer.get_array() is k.dat.buffer.get_array()