NVIDIA · loliverhennigh · Apr 29, 2026 · Apr 30, 2026 · May 1, 2026 · May 1, 2026
@@ -47,7 +47,7 @@
     # If minimum versions are met, we can import the shard tensor and spec.
 
     from ._shard_tensor_spec import ShardTensorSpec
-    from .shard_tensor import ShardTensor, scatter_tensor
+    from .shard_tensor import ShardTensor, replicated_zeros_like, scatter_tensor
 
     def register_custom_ops():
         # These imports will register the custom ops with the ShardTensor class.
@@ -69,3 +69,4 @@ def register_custom_ops():
     ShardTensor = None
     ShardTensorSpec = None
     scatter_tensor = None
+    replicated_zeros_like = None
diff --git a/physicsnemo/domain_parallel/shard_tensor.py b/physicsnemo/domain_parallel/shard_tensor.py
@@ -908,6 +908,38 @@ def to_local(
 
         return _ToTorchTensor.apply(self, grad_placements)
 
+    def new_replicated_zeros(
+        self,
+        shape: Sequence[int] | torch.Size,
+        *,
+        dtype: torch.dtype | None = None,
+    ) -> "ShardTensor":
+        r"""Create a replicated zero tensor on this tensor's mesh.
+
+        This is useful for reductions/accumulators where an op naturally produces
+        a replicated output regardless of input placement.
+
+        Parameters
+        ----------
+        shape : Sequence[int] or torch.Size
+            Global shape of the output tensor.
+        dtype : torch.dtype, optional
+            Output dtype. Defaults to this tensor's dtype.
+
+        Returns
+        -------
+        ShardTensor
+            A replicated ShardTensor of zeros on the same mesh.
+        """
+        out_dtype = self.dtype if dtype is None else dtype
+        local = torch.zeros(tuple(shape), dtype=out_dtype, device=self.device)
+        return ShardTensor.from_local(
+            local,
+            self._spec.mesh,
+            [Replicate() for _ in range(self._spec.mesh.ndim)],
+            sharding_shapes="infer",
+        )
+
     def full_tensor(
         self, *, grad_placements: Sequence[Placement] | None = None
     ) -> torch.Tensor:
@@ -965,6 +997,71 @@ def backward(self, *args, **kwargs):
         return self.to_local().backward(*args, **kwargs)
 
 
+def replicated_zeros_like(
+    tensor: torch.Tensor,
+    shape: Sequence[int] | torch.Size,
+    *,
+    dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    r"""Create zeros matching a tensor's device/mesh semantics.
+
+    For ``ShardTensor`` inputs this returns a replicated ``ShardTensor`` on the
+    same mesh. For regular tensors this falls back to ``torch.zeros`` on the
+    input device.
+    """
+    if isinstance(tensor, ShardTensor):
+        return tensor.new_replicated_zeros(shape, dtype=dtype)
+    out_dtype = tensor.dtype if dtype is None else dtype
+    return torch.zeros(tuple(shape), dtype=out_dtype, device=tensor.device)
+
+
+def _cross_wrapper(func, types, args, kwargs):
+    if kwargs is None:
+        kwargs = {}
+
+    if kwargs.get("out", None) is not None:
+        raise RuntimeError("torch.linalg.cross(out=...) is not supported for ShardTensor.")
+
+    input_tensor = args[0] if len(args) > 0 else kwargs.get("input")
+    other_tensor = args[1] if len(args) > 1 else kwargs.get("other")
+    dim = kwargs.get("dim", -1)
+    if len(args) > 2:
+        dim = args[2]
+
+    if not isinstance(input_tensor, ShardTensor) or not isinstance(
+        other_tensor, ShardTensor
+    ):
+        raise RuntimeError(
+            "torch.linalg.cross with ShardTensor inputs requires both arguments to be ShardTensor."
+        )
-    if not isinstance(input_tensor, ShardTensor) or not isinstance(
-        other_tensor, ShardTensor
-    ):
-        raise RuntimeError(
-            "torch.linalg.cross with ShardTensor inputs requires both arguments to be ShardTensor."
-        )
+        raise RuntimeError(
+            f"{func.__module__}.{func.__name__} with ShardTensor inputs requires both arguments to be ShardTensor."
+        )
-    if not isinstance(input_tensor, ShardTensor) or not isinstance(
-        other_tensor, ShardTensor
-    ):
-        raise RuntimeError(
-            "torch.linalg.cross with ShardTensor inputs requires both arguments to be ShardTensor."
-        )
+        raise RuntimeError(
+            f"{func.__module__}.{func.__name__} with ShardTensor inputs requires both arguments to be ShardTensor."
+        )
+
+    if input_tensor._spec.mesh != other_tensor._spec.mesh:
+        raise RuntimeError(
+            "torch.linalg.cross requires both ShardTensor inputs to share the same device mesh."
+        )
+    if input_tensor._spec.placements != other_tensor._spec.placements:
+        raise RuntimeError(
+            "torch.linalg.cross requires both ShardTensor inputs to have identical placements."
+        )
+
+    local_result = torch.linalg.cross(
+        input_tensor.to_local(),
+        other_tensor.to_local(),
+        dim=dim,
+    )
+    return ShardTensor.from_local(
+        local_result,
+        input_tensor._spec.mesh,
+        input_tensor._spec.placements,
+        sharding_shapes=input_tensor._spec.sharding_shapes(),
+    )
+
+
+ShardTensor.register_function_handler(torch.linalg.cross, _cross_wrapper)
+if hasattr(torch, "cross"):
+    ShardTensor.register_function_handler(torch.cross, _cross_wrapper)
+
+
 def scatter_tensor(
     tensor: torch.Tensor,
     global_src: int,

@@ -1463,15 +1463,14 @@ def cell_data_to_point_data(self, overwrite_keys: bool = False) -> "Mesh":
         # Shape: (n_cells * n_vertices_per_cell,)
         point_indices = self.cells.flatten()
 
-        # Corresponding cell index for each point
-        # Shape: (n_cells * n_vertices_per_cell,)
-        cell_indices = torch.arange(
-            self.n_cells, device=self.points.device
-        ).repeat_interleave(n_vertices_per_cell)
+        # Repeat each cell value once per incident vertex.
+        # This avoids advanced indexing with an explicit cell-index tensor.
 
         converted = self.cell_data.apply(
             lambda cell_values: scatter_aggregate(
-                src_data=cell_values[cell_indices],
+                src_data=cell_values.unsqueeze(1)
+                .expand(-1, n_vertices_per_cell, *cell_values.shape[1:])
+                .reshape(-1, *cell_values.shape[1:]),
                 src_to_dst_mapping=point_indices,
                 n_dst=self.n_points,
                 weights=None,

@@ -24,9 +24,28 @@
 import torch
 from jaxtyping import Float, Int
 
+from physicsnemo.domain_parallel import replicated_zeros_like
 from physicsnemo.mesh.utilities._tolerances import safe_eps
 
 
+def _is_sharded_tensor(tensor: torch.Tensor) -> bool:
+    return hasattr(tensor, "_spec") and hasattr(type(tensor), "from_local")
+
+
+def _replicated_zeros_like(
+    tensor: torch.Tensor, shape: tuple[int, ...], dtype: torch.dtype
+) -> torch.Tensor:
+    if not _is_sharded_tensor(tensor) or replicated_zeros_like is None:
+        return torch.zeros(shape, dtype=dtype, device=tensor.device)
+
+    # Delegate replicated temporary allocation to the ShardTensor layer.
+    return replicated_zeros_like(
+        tensor,
+        shape,
+        dtype=dtype,
+    )
+
+
 def scatter_aggregate(
     src_data: Float[torch.Tensor, "n_src ..."],
     src_to_dst_mapping: Int[torch.Tensor, " n_src"],
@@ -93,7 +112,9 @@ def scatter_aggregate(
 
     ### Fast path: unweighted sum is a single scatter_add_ with no extra work
     if weights is None and aggregation == "sum":
-        aggregated_data = torch.zeros((n_dst, *data_shape), dtype=dtype, device=device)
+        aggregated_data = _replicated_zeros_like(
+            src_data, (n_dst, *data_shape), dtype
+        )
         expanded_indices = src_to_dst_mapping.view(
             -1, *([1] * len(data_shape))
         ).expand_as(src_data)
@@ -102,7 +123,10 @@ def scatter_aggregate(
 
     ### Initialize weights if not provided
     if weights is None:
-        weights = torch.ones(len(src_to_dst_mapping), dtype=dtype, device=device)
+        if _is_sharded_tensor(src_data):
+            weights = torch.ones_like(src_to_dst_mapping, dtype=dtype)
+        else:
+            weights = torch.ones(len(src_to_dst_mapping), dtype=dtype, device=device)
 
     ### Ensure weights have same dtype as data (avoid dtype mismatch in multiplication)
     if weights.dtype != dtype:
@@ -114,11 +138,7 @@ def scatter_aggregate(
     weighted_data = src_data * weights.view(weight_shape)
 
     ### Scatter-add weighted data to destinations
-    aggregated_data = torch.zeros(
-        (n_dst, *data_shape),
-        dtype=dtype,
-        device=device,
-    )
+    aggregated_data = _replicated_zeros_like(src_data, (n_dst, *data_shape), dtype)
 
     # Expand src_to_dst_mapping to match data dimensions
     expanded_indices = src_to_dst_mapping.view(-1, *([1] * len(data_shape))).expand_as(
@@ -134,7 +154,7 @@ def scatter_aggregate(
     ### Normalize weighted sum to weighted mean
     if aggregation == "mean":
         ### Compute sum of weights at each destination
-        weight_sums = torch.zeros(n_dst, dtype=dtype, device=device)
+        weight_sums = _replicated_zeros_like(src_data, (n_dst,), dtype)
         weight_sums.scatter_add_(
             dim=0,
             index=src_to_dst_mapping,