Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 47 additions & 11 deletions ding/policy/base_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from ding.model import create_model
from ding.utils import import_module, allreduce, allreduce_with_indicator, broadcast, get_rank, allreduce_async, \
synchronize, deep_merge_dicts, POLICY_REGISTRY
from ding.torch_utils import auto_device_init, move_to_device


class Policy(ABC):
Expand Down Expand Up @@ -83,8 +84,12 @@ def default_config(cls: type) -> EasyDict:
config = dict(
# (bool) Whether the learning policy is the same as the collecting data policy (on-policy).
on_policy=False,
# (bool) Whether to use cuda in policy.
# (bool) Whether to use cuda in policy (deprecated, use 'device' instead).
cuda=False,
# (str) Device to use for policy. Can be 'auto', 'cuda', 'npu', or 'cpu'.
# 'auto' will automatically detect NPU > GPU > CPU.
# If not specified, will use 'cuda' config for backward compatibility.
device='auto',
# (bool) Whether to use data parallel multi-gpu mode in policy.
multi_gpu=False,
# (bool) Whether to synchronize update the model parameters after allreduce the gradients of model parameters.
Expand Down Expand Up @@ -136,25 +141,56 @@ def __init__(

if len(set(self._enable_field).intersection(set(['learn', 'collect', 'eval']))) > 0:
model = self._create_model(cfg, model)
self._cuda = cfg.cuda and torch.cuda.is_available()

# Device initialization with auto-detection support for NPU/GPU/CPU
# Backward compatibility: if 'device' not in cfg, use 'cuda' config
if hasattr(cfg, 'device') and cfg.device is not None:
# New way: use 'device' config for auto-detection or explicit setting
cfg_device = cfg.device
else:
# Legacy way: convert 'cuda' boolean to device string
cfg_device = 'cuda' if (hasattr(cfg, 'cuda') and cfg.cuda) else 'cpu'

# now only support multi-gpu for only enable learn mode
if len(set(self._enable_field).intersection(set(['learn']))) > 0:
multi_gpu = self._cfg.multi_gpu
self._rank = get_rank() if multi_gpu else 0
if self._cuda:
# model.cuda() is an in-place operation.
model.cuda()
else:
self._rank = 0

# Auto-detect or set device
self._device_type, self._use_accelerator, self._device = auto_device_init(cfg_device, self._rank)

# Keep backward compatibility with _cuda attribute
# Set _cuda=True for ANY accelerator (GPU or NPU) to ensure data transfer logic works
self._cuda = self._use_accelerator

# Move model to the detected/configured device
if self._use_accelerator:
move_to_device(model, self._device_type, self._rank)

# Print final device configuration summary
print(f"\n{'='*70}")
print(f"🎉 [DI-engine Policy] Device Setup Complete")
print(f"{'='*70}")
print(f" Policy Type: {self.__class__.__name__}")
print(f" Device Type: {self._device_type.upper()}")
print(f" Device String: {self._device}")
print(f" Using Accelerator: {self._use_accelerator}")
print(f" Rank: {self._rank}")
print(f" Multi-GPU: {self._cfg.multi_gpu if hasattr(self._cfg, 'multi_gpu') else False}")
print(f" Legacy _cuda flag: {self._cuda}")
print(f"{'='*70}\n")

# Multi-GPU initialization
if len(set(self._enable_field).intersection(set(['learn']))) > 0:
multi_gpu = self._cfg.multi_gpu
if multi_gpu:
bp_update_sync = self._cfg.bp_update_sync
self._bp_update_sync = bp_update_sync
self._init_multi_gpu_setting(model, bp_update_sync)
else:
self._rank = 0
if self._cuda:
# model.cuda() is an in-place operation.
model.cuda()

self._model = model
self._device = 'cuda:{}'.format(self._rank % torch.cuda.device_count()) if self._cuda else 'cpu'
else:
self._cuda = False
self._rank = 0
Expand Down
2 changes: 2 additions & 0 deletions ding/torch_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@
from .dataparallel import DataParallel
from .reshape_helper import fold_batch, unfold_batch, unsqueeze_repeat
from .parameter import NonegativeParameter, TanhParameter
from .device_helper import get_available_device, get_device_count, move_to_device, get_device_string, \
auto_device_init, is_npu_available, is_cuda_available
224 changes: 224 additions & 0 deletions ding/torch_utils/device_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
"""
Copyright 2020 Sensetime X-lab. All Rights Reserved.

Device helper utilities for automatic detection of NPU and GPU devices.
Supports Huawei Ascend NPU (torch_npu) and NVIDIA GPU (torch.cuda).
"""

import torch
from typing import Tuple, Optional
import logging

# Try to import torch_npu for Huawei NPU support
try:
import torch_npu
TORCH_NPU_AVAILABLE = True
except ImportError:
TORCH_NPU_AVAILABLE = False

logger = logging.getLogger(__name__)


def get_available_device() -> Tuple[str, bool]:
    """
    Overview:
        Probe the runtime for an accelerator and report which device type should be used.
        The probing order is NPU first, then CUDA GPU, then CPU as the last resort.
        A human-readable detection report is printed to stdout and the final choice is
        also sent to the module logger.
    Returns:
        - device_type (:obj:`str`): One of ``'npu'``, ``'cuda'`` or ``'cpu'``.
        - is_accelerator (:obj:`bool`): ``True`` when an NPU or GPU was selected, ``False`` for CPU.
    Examples:
        >>> device_type, is_accelerator = get_available_device()
        >>> print(f"Using device: {device_type}")
    """
    rule = "=" * 70
    print("\n" + rule)
    print("🔍 [DI-engine] Device Detection")
    print(rule)

    # Step 1: Huawei Ascend NPU, only probed when the torch_npu import succeeded.
    if not TORCH_NPU_AVAILABLE:
        print("✗ torch_npu module is not installed")
    else:
        print("✓ torch_npu module is installed")
        if not torch.npu.is_available():
            print("✗ NPU is not available")
        else:
            num_npu = torch.npu.device_count()
            print(f"✓ NPU is available: {num_npu} device(s) detected")
            npu_names = [torch.npu.get_device_name(idx) for idx in range(num_npu)]
            print(f"✓ NPU device names: {npu_names}")
            print("🎯 Selected device: NPU")
            print(rule + "\n")
            logger.info(f"[Device] Using NPU with {num_npu} device(s)")
            return 'npu', True

    # Step 2: no usable NPU, so fall through to CUDA; Step 3: CPU when CUDA is missing too.
    if not torch.cuda.is_available():
        print("✗ CUDA is not available")
        print("🎯 Selected device: CPU (no accelerator detected)")
        print(rule + "\n")
        logger.info("[Device] Using CPU (no accelerator available)")
        return 'cpu', False

    num_gpu = torch.cuda.device_count()
    print(f"✓ CUDA is available: {num_gpu} device(s) detected")
    gpu_names = [torch.cuda.get_device_name(idx) for idx in range(num_gpu)]
    print(f"✓ GPU device names: {gpu_names}")
    print("🎯 Selected device: CUDA GPU")
    print(rule + "\n")
    logger.info(f"[Device] Using CUDA GPU with {num_gpu} device(s)")
    return 'cuda', True


def get_device_count(device_type: str) -> int:
    """
    Overview:
        Report how many devices of the given type are visible to PyTorch.
    Arguments:
        - device_type (:obj:`str`): Device type, expected to be ``'npu'``, ``'cuda'`` or ``'cpu'``.
    Returns:
        - count (:obj:`int`): ``torch.cuda.device_count()`` for ``'cuda'``; \
            ``torch.npu.device_count()`` for ``'npu'`` when torch_npu was imported; \
            ``1`` for every other case (including ``'npu'`` without torch_npu).
    """
    if device_type == 'cuda':
        return torch.cuda.device_count()
    # The type check comes first so the NPU flag is only consulted for 'npu' requests.
    if device_type == 'npu' and TORCH_NPU_AVAILABLE:
        return torch.npu.device_count()
    # CPU (or anything unrecognized) is treated as a single "device".
    return 1


def move_to_device(model: torch.nn.Module, device_type: str, rank: int = 0) -> torch.nn.Module:
    """
    Overview:
        Place a model on the accelerator selected by ``device_type``. The device index is
        ``rank`` modulo the number of visible devices of that type (index 0 when none are reported).
        For any other ``device_type`` the model is left untouched.
    Arguments:
        - model (:obj:`torch.nn.Module`): The model to move.
        - device_type (:obj:`str`): Device type, expected to be ``'npu'``, ``'cuda'`` or ``'cpu'``.
        - rank (:obj:`int`): Process rank used to pick the device index.
    Returns:
        - model (:obj:`torch.nn.Module`): The same model object that was passed in.
    """
    if device_type == 'cuda':
        n_devices = torch.cuda.device_count()
        device_id = 0 if n_devices <= 0 else rank % n_devices
        print(f"📦 [DI-engine] Moving model to CUDA device {device_id} (rank={rank})")
        model.cuda(device_id)
        logger.info(f"[Device] Model moved to CUDA device {device_id}")
        return model

    # 'npu' is only honoured when torch_npu was imported; otherwise it falls through below.
    if device_type == 'npu' and TORCH_NPU_AVAILABLE:
        n_devices = torch.npu.device_count()
        device_id = 0 if n_devices <= 0 else rank % n_devices
        print(f"📦 [DI-engine] Moving model to NPU device {device_id} (rank={rank})")
        model.npu(device_id)
        logger.info(f"[Device] Model moved to NPU device {device_id}")
        return model

    # Nothing to do for CPU: the model is not moved at all.
    print(f"📦 [DI-engine] Model will stay on CPU")
    logger.info("[Device] Model stays on CPU")
    return model


def get_device_string(device_type: str, rank: int = 0) -> str:
    """
    Overview:
        Build the device string used for PyTorch tensor placement.
    Arguments:
        - device_type (:obj:`str`): Device type, expected to be ``'npu'``, ``'cuda'`` or ``'cpu'``.
        - rank (:obj:`int`): Process rank; mapped onto a device index with a modulo over the \
            device count reported by ``get_device_count``.
    Returns:
        - device_str (:obj:`str`): ``'<device_type>:<index>'`` for ``'npu'`` / ``'cuda'``, \
            and ``'cpu'`` for every other ``device_type``.
    """
    # Anything that is not a known accelerator type resolves to plain 'cpu'.
    if device_type not in ('npu', 'cuda'):
        return 'cpu'

    n_devices = get_device_count(device_type)
    # Guard the modulo against a zero device count.
    index = rank % n_devices if n_devices > 0 else 0
    return '{}:{}'.format(device_type, index)


def auto_device_init(cfg_device: Optional[str], rank: int = 0) -> Tuple[str, bool, str]:
    """
    Overview:
        Resolve the device setting from the config into a concrete device type, an accelerator
        flag and a PyTorch device string. ``'auto'`` (or ``None``) triggers detection via
        ``get_available_device``; an explicit ``'npu'`` / ``'cuda'`` request is validated and
        downgraded to CPU with a warning when that accelerator is not usable.
    Arguments:
        - cfg_device (:obj:`Optional[str]`): Device configuration from config. One of ``'auto'``, \
            ``'npu'``, ``'cuda'``, ``'cpu'`` (matched case-insensitively, surrounding whitespace \
            ignored) or ``None`` (treated as ``'auto'``). Any other value falls back to CPU \
            with a warning.
        - rank (:obj:`int`): Device rank for multi-device setups, forwarded to ``get_device_string``.
    Returns:
        - device_type (:obj:`str`): Resolved device type (``'npu'``, ``'cuda'`` or ``'cpu'``).
        - use_accelerator (:obj:`bool`): Whether an accelerator is being used.
        - device_str (:obj:`str`): Device string produced by ``get_device_string``.
    Examples:
        >>> device_type, use_accelerator, device_str = auto_device_init('auto')
        >>> # Returns ('npu', True, 'npu:0') if NPU available
        >>> # Returns ('cuda', True, 'cuda:0') if GPU available
        >>> # Returns ('cpu', False, 'cpu') otherwise
    """
    print(f"\n⚙️ [DI-engine] Device Configuration: cfg_device='{cfg_device}', rank={rank}")

    # Normalize once, BEFORE any comparison, so that 'AUTO' / 'Auto' select auto-detection
    # instead of silently dropping into the CPU branch.
    requested = 'auto' if cfg_device is None else cfg_device.strip().lower()

    if requested == 'auto':
        print(f"🔧 [DI-engine] Using auto-detection mode")
        device_type, use_accelerator = get_available_device()
    else:
        # Explicit device type specified
        device_type = requested
        print(f"🔧 [DI-engine] Explicit device type requested: '{device_type}'")

        # Validate that the requested device type is actually usable
        if device_type == 'npu':
            if TORCH_NPU_AVAILABLE and torch.npu.is_available():
                use_accelerator = True
                npu_count = torch.npu.device_count()
                print(f"✓ NPU requested and available: {npu_count} device(s)")
                logger.info(f"[Device] Using NPU as explicitly configured ({npu_count} device(s))")
            else:
                print(f"⚠️ NPU requested but not available, falling back to CPU")
                logger.warning("[Device] NPU requested but not available, falling back to CPU")
                device_type = 'cpu'
                use_accelerator = False
        elif device_type == 'cuda':
            if torch.cuda.is_available():
                use_accelerator = True
                gpu_count = torch.cuda.device_count()
                print(f"✓ CUDA requested and available: {gpu_count} device(s)")
                logger.info(f"[Device] Using CUDA GPU as explicitly configured ({gpu_count} device(s))")
            else:
                print(f"⚠️ CUDA requested but not available, falling back to CPU")
                logger.warning("[Device] CUDA requested but not available, falling back to CPU")
                device_type = 'cpu'
                use_accelerator = False
        elif device_type == 'cpu':
            use_accelerator = False
            print(f"✓ Using CPU as configured")
            logger.info("[Device] Using CPU as configured")
        else:
            # Unrecognized value (typo, 'gpu', 'cuda:0', ...): keep the CPU fallback so existing
            # configs do not start crashing, but say so instead of claiming CPU was configured.
            print(f"⚠️ Unknown device type '{device_type}', falling back to CPU")
            logger.warning(f"[Device] Unknown device type '{device_type}', falling back to CPU")
            device_type = 'cpu'
            use_accelerator = False

    device_str = get_device_string(device_type, rank)

    print(
        f"✅ [DI-engine] Device initialized: type={device_type}, "
        f"accelerator={use_accelerator}, device_string='{device_str}'"
    )
    print("=" * 70 + "\n")

    return device_type, use_accelerator, device_str


def is_npu_available() -> bool:
    """
    Overview:
        Tell whether a Huawei NPU can be used: the torch_npu import must have succeeded
        and ``torch.npu.is_available()`` must report a device.
    Returns:
        - available (:obj:`bool`): ``True`` if an NPU is available, ``False`` otherwise.
    """
    # Without torch_npu the torch.npu namespace must not be touched at all.
    if not TORCH_NPU_AVAILABLE:
        return False
    return torch.npu.is_available()


def is_cuda_available() -> bool:
    """
    Overview:
        Tell whether PyTorch reports a usable CUDA device; a thin wrapper around
        ``torch.cuda.is_available()``.
    Returns:
        - available (:obj:`bool`): ``True`` if CUDA is available, ``False`` otherwise.
    """
    cuda_ready = torch.cuda.is_available()
    return cuda_ready
25 changes: 22 additions & 3 deletions ding/utils/default_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@
import torch
import treetensor.torch as ttorch

# Try to import torch_npu for Huawei NPU support
try:
import torch_npu
TORCH_NPU_AVAILABLE = True
except ImportError:
TORCH_NPU_AVAILABLE = False


def get_shape0(data: Union[List, Dict, torch.Tensor, ttorch.Tensor]) -> int:
"""
Expand Down Expand Up @@ -418,7 +425,7 @@ def set_pkg_seed(seed: int, use_cuda: bool = True) -> None:
This is usually used in the entry script, in the section that sets the random seed for all packages and instances
Argument:
- seed(:obj:`int`): Set seed
- use_cuda(:obj:`bool`) Whether use cude
- use_cuda(:obj:`bool`) Whether use cuda or other accelerators (NPU/GPU)
Examples:
>>> # ../entry/xxxenv_xxxpolicy_main.py
>>> ...
Expand All @@ -431,11 +438,23 @@ def set_pkg_seed(seed: int, use_cuda: bool = True) -> None:
>>> ...

"""
print(f"\n🌱 [DI-engine] Setting random seed: {seed}")
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if use_cuda and torch.cuda.is_available():
torch.cuda.manual_seed(seed)
print(f" ✓ Set seed for: random, numpy, torch")

# Set seed for accelerators (GPU or NPU)
if use_cuda:
# Set CUDA seed if available
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
print(f" ✓ Set CUDA seed: {seed}")
# Set NPU seed if available
if TORCH_NPU_AVAILABLE and torch.npu.is_available():
torch.npu.manual_seed(seed)
print(f" ✓ Set NPU seed: {seed}")
print()


@lru_cache()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
stop_value=195,
),
policy=dict(
cuda=False,
device='auto', # Auto-detect NPU > GPU > CPU
action_space='discrete',
model=dict(
obs_shape=4,
Expand Down
Loading