Skip to content
Draft
Show file tree
Hide file tree
Changes from 37 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
cfc8f09
Fast direct USB connectivity
rltakashige Mar 10, 2026
5e9d27b
Show Sparks and Linux in topology
rltakashige Mar 10, 2026
6be6ea5
Fix placement preview
rltakashige Mar 10, 2026
ca5870a
Some Linux Laptop/Desktop detection and goodbye penguin
rltakashige Mar 10, 2026
34df811
Vibe coding design baby
rltakashige Mar 10, 2026
659c1bc
Progress: Run EXO-CUDA through nix!
rltakashige Mar 11, 2026
9a83fa6
Patch VLLM to load multiple models dynamically
rltakashige Mar 11, 2026
ba35a4b
Move VLLM into the runner and add type stubs
rltakashige Mar 11, 2026
0c8615f
Only import VLLM once..
rltakashige Mar 11, 2026
3e097f7
only download a single copy of the model.
rltakashige Mar 11, 2026
404b976
Download models without model.safetensors.index
rltakashige Mar 11, 2026
2683ac7
Add Torch typings
rltakashige Mar 11, 2026
f75d36c
Fix cache patch
rltakashige Mar 11, 2026
9ee23ee
Make vllm inference runner closer to the normal inference runner
rltakashige Mar 11, 2026
8f94727
Ignore missing modules if type stubs exist
rltakashige Mar 11, 2026
1331465
Add missing runner features
rltakashige Mar 12, 2026
3f4f7c9
Only do for aarch64 linux
rltakashige Mar 12, 2026
4b6dd75
lockgit status
rltakashige Mar 12, 2026
957ebbd
Set max token length as max context length if no max tokens set
rltakashige Mar 12, 2026
87b7c5e
Pass CI
rltakashige Mar 12, 2026
5d7a005
Destroy process group on Keyboard Interrupt
rltakashige Mar 12, 2026
585dfe3
Merge branch 'main' into leo/dgx-spark-integrations
rltakashige Mar 12, 2026
3503011
ExoBench and ExoEval for CUDA
rltakashige Mar 12, 2026
283b180
Move VLLM runner into VLLM engine
rltakashige Mar 12, 2026
493e342
Skip impossible shardings
rltakashige Mar 12, 2026
7bb5cb4
Allow memory profiling to be unstable
rltakashige Mar 12, 2026
4a7901c
Fix patches
rltakashige Mar 12, 2026
169ea2a
Fix GPT OSS by not retokenizing prompts
rltakashige Mar 12, 2026
e9e23e5
Have loading progress
rltakashige Mar 12, 2026
594ed99
new uv lock for fastsafetensors
rltakashige Mar 13, 2026
3c29d0d
test prefix caching
rltakashige Mar 13, 2026
073f8c1
add batching
rltakashige Mar 16, 2026
dc68ddb
Prompt formatting
rltakashige Mar 16, 2026
e96f084
Distributed callbacks
rltakashige Mar 16, 2026
ec5d62f
Strip vllm generator
rltakashige Mar 16, 2026
04dcdbd
Merge main
rltakashige Mar 16, 2026
8cd1308
Tidy pass 1
rltakashige Mar 16, 2026
72cd855
Merge branch 'main' into leo/dgx-spark-integrations
rltakashige Mar 17, 2026
c70d900
Address comments including task id interface
rltakashige Mar 17, 2026
e78e53d
Address comments 2 including vllm capability in state
rltakashige Mar 17, 2026
e1df77b
No more future annotations
rltakashige Mar 17, 2026
6a3eb2f
close()
rltakashige Mar 17, 2026
cacd26e
Type error lol
rltakashige Mar 17, 2026
1dd9c28
Address comments 3, mainly refactors
rltakashige Mar 17, 2026
655185c
Address comments 4 - defer to the warmup into the exo batch generator…
rltakashige Mar 17, 2026
be731d3
Merge main
rltakashige Mar 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .cuda_typings/openai_harmony/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from enum import Enum

# Type stubs for the `openai_harmony` package — only the surface exo uses.

class HarmonyEncodingName(Enum):
    # Encoding name for GPT-OSS models.
    HARMONY_GPT_OSS = ...

class HarmonyEncoding: ...
class HarmonyError(Exception): ...

class Role(Enum):
    ASSISTANT = ...

class StreamableParser:
    # Incremental parser fed one token id at a time via process().
    last_content_delta: str  # text delta produced by the most recent token
    current_channel: str | None  # presumably the active Harmony channel — confirm against openai_harmony docs
    current_recipient: str | None  # presumably the active tool recipient — confirm against openai_harmony docs

    def __init__(self, encoding: HarmonyEncoding, role: Role = ...) -> None: ...
    def process(self, token_id: int) -> None: ...

def load_harmony_encoding(name: HarmonyEncodingName) -> HarmonyEncoding: ...
8 changes: 8 additions & 0 deletions .cuda_typings/torch/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Minimal `torch` stubs limited to the attributes exo touches.
from torch import backends as backends
from torch import cuda as cuda
from torch import distributed as distributed

__version__: str

class version:
    # Models the `torch.version` submodule; exo reads the CUDA toolkit version.
    cuda: str
1 change: 1 addition & 0 deletions .cuda_typings/torch/backends/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Re-export so `torch.backends.cuda` resolves for the type checker.
from torch.backends import cuda as cuda
1 change: 1 addition & 0 deletions .cuda_typings/torch/backends/cuda/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# True when this torch build was compiled with CUDA support.
def is_built() -> bool: ...
8 changes: 8 additions & 0 deletions .cuda_typings/torch/cuda/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
class _DeviceProperties:
    # Subset of torch's CUDA device-properties struct that exo reads.
    total_memory: int  # total device memory in bytes

def is_available() -> bool: ...
def get_device_name(device: int) -> str: ...
def get_device_properties(device: int) -> _DeviceProperties: ...
def empty_cache() -> None: ...
def mem_get_info() -> tuple[int, int]: ...  # (free, total) bytes per torch docs
2 changes: 2 additions & 0 deletions .cuda_typings/torch/distributed/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Process-group lifecycle helpers used for clean shutdown.
def is_initialized() -> bool: ...
def destroy_process_group() -> None: ...
1 change: 1 addition & 0 deletions .cuda_typings/vllm/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Minimal stub: exo only reads the installed vllm version string.
__version__: str
2 changes: 2 additions & 0 deletions .cuda_typings/vllm/config.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class ModelConfig:
    # Maximum model context length as resolved by vllm.
    max_model_len: int
Empty file.
18 changes: 18 additions & 0 deletions .cuda_typings/vllm/engine/arg_utils.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from dataclasses import dataclass

@dataclass
class EngineArgs:
    # Construction arguments for vllm's engine — only the fields exo sets.
    model: str = ...
    served_model_name: str | list[str] | None = ...
    tokenizer: str | None = ...
    trust_remote_code: bool = ...
    dtype: str = ...
    seed: int = ...
    max_model_len: int | None = ...  # None lets vllm derive it from the model config
    gpu_memory_utilization: float = ...
    enforce_eager: bool = ...
    tensor_parallel_size: int = ...
    pipeline_parallel_size: int = ...
    quantization: str | None = ...
    load_format: str = ...
    enable_sleep_mode: bool = ...
17 changes: 17 additions & 0 deletions .cuda_typings/vllm/outputs.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
class CompletionOutput:
    # One sampled completion within a RequestOutput.
    index: int
    text: str
    token_ids: list[int]
    cumulative_logprob: float | None
    logprobs: object | None
    finish_reason: str | None  # e.g. "stop" / "length" — per vllm docs; None while running
    stop_reason: int | str | None

    def finished(self) -> bool: ...

class RequestOutput:
    # Engine output for a single request, yielded by LLMEngine.step().
    request_id: str
    prompt: str | None
    prompt_token_ids: list[int] | None
    outputs: list[CompletionOutput]
    finished: bool
11 changes: 11 additions & 0 deletions .cuda_typings/vllm/sampling_params.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
class SamplingParams:
    # Per-request sampling configuration — only the fields exo sets/reads.
    n: int
    temperature: float
    top_p: float
    top_k: int
    min_p: float
    seed: int | None
    stop: str | list[str] | None
    max_tokens: int | None  # None means no generation cap beyond context length
    logprobs: int | None
    repetition_penalty: float
3 changes: 3 additions & 0 deletions .cuda_typings/vllm/tokenizers/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Public re-export of the tokenizer protocol.
from vllm.tokenizers.protocol import TokenizerLike

__all__ = ["TokenizerLike"]
15 changes: 15 additions & 0 deletions .cuda_typings/vllm/tokenizers/protocol.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import Protocol

class TokenizerLike(Protocol):
    # Structural type for the tokenizer objects vllm hands back; matches the
    # HuggingFace-style encode/decode/chat-template surface exo calls.
    @property
    def eos_token_id(self) -> int: ...
    @property
    def vocab_size(self) -> int: ...
    def encode(self, text: str, add_special_tokens: bool = ...) -> list[int]: ...
    def decode(self, ids: list[int] | int, skip_special_tokens: bool = ...) -> str: ...
    def apply_chat_template(
        self,
        messages: list[dict[str, str]],
        tools: list[dict[str, object]] | None = ...,
        **kwargs: object,
    ) -> str | list[int]: ...  # str or token ids depending on tokenize kwarg — confirm against HF docs
1 change: 1 addition & 0 deletions .cuda_typings/vllm/v1/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions .cuda_typings/vllm/v1/core/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

24 changes: 24 additions & 0 deletions .cuda_typings/vllm/v1/core/kv_cache_manager.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from collections.abc import Sequence

from vllm.v1.core.kv_cache_utils import BlockPool, KVCacheBlock
from vllm.v1.kv_cache_interface import KVCacheConfig

class KVCacheBlocks:
    # Blocks allocated to one request, grouped per KV-cache group.
    blocks: tuple[Sequence[KVCacheBlock], ...]
    def __init__(self, blocks: tuple[Sequence[KVCacheBlock], ...]) -> None: ...
    def get_block_ids(self) -> tuple[list[int], ...]: ...

class KVCacheManager:
    # vllm's paged KV-cache allocator; exo inspects/patches these members.
    block_pool: BlockPool
    kv_cache_config: KVCacheConfig
    enable_caching: bool  # whether prefix caching is enabled
    num_kv_cache_groups: int
    coordinator: object
    def __init__(self, *args: object, **kwargs: object) -> None: ...
    def allocate_slots(
        self, request: object, num_new_tokens: int, *args: object, **kwargs: object
    ) -> KVCacheBlocks | None: ...  # None presumably signals allocation failure — confirm against vllm source
    def get_computed_blocks(self, request: object) -> tuple[KVCacheBlocks, int]: ...
    def create_kv_cache_blocks(
        self, blocks: tuple[list[KVCacheBlock], ...]
    ) -> KVCacheBlocks: ...
16 changes: 16 additions & 0 deletions .cuda_typings/vllm/v1/core/kv_cache_utils.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
class KVCacheBlock:
    # One fixed-size block of the paged KV cache.
    block_id: int
    ref_cnt: int  # reference count across requests sharing the block
    def __init__(self, block_id: int) -> None: ...

class FreeKVCacheBlockQueue:
    # Free-list with bulk append/pop.
    def append_n(self, blocks: list[KVCacheBlock]) -> None: ...
    def popleft_n(self, n: int) -> list[KVCacheBlock]: ...

class BlockPool:
    # Owns all KV-cache blocks and the free queue.
    blocks: list[KVCacheBlock]
    free_block_queue: FreeKVCacheBlockQueue
    num_gpu_blocks: int
    enable_caching: bool
    def get_num_free_blocks(self) -> int: ...
    def get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]: ...
Empty file.
22 changes: 22 additions & 0 deletions .cuda_typings/vllm/v1/engine/llm_engine.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from vllm.config import ModelConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import TokenizerLike

class LLMEngine:
    # Synchronous v1 engine surface used by exo's vllm runner.
    tokenizer: TokenizerLike | None
    model_config: ModelConfig

    @classmethod
    def from_engine_args(cls, engine_args: EngineArgs) -> LLMEngine: ...
    def add_request(
        self,
        request_id: str,
        prompt: str,
        params: SamplingParams,
        arrival_time: float | None = ...,
    ) -> None: ...
    def step(self) -> list[RequestOutput]: ...  # one schedule/execute iteration per vllm docs
    def has_unfinished_requests(self) -> bool: ...
    def get_tokenizer(self) -> TokenizerLike: ...
17 changes: 17 additions & 0 deletions .cuda_typings/vllm/v1/kv_cache_interface.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from dataclasses import dataclass

@dataclass
class KVCacheSpec:
    # Shape parameters of one attention layer's KV cache.
    block_size: int
    num_kv_heads: int
    head_size: int

@dataclass
class KVCacheGroupSpec:
    # Layers that share a single KV-cache spec.
    layer_names: list[str]
    kv_cache_spec: KVCacheSpec

@dataclass
class KVCacheConfig:
    # Overall cache layout: block count plus per-group specs.
    num_blocks: int
    kv_cache_groups: list[KVCacheGroupSpec]
6 changes: 6 additions & 0 deletions .cuda_typings/vllm/v1/request.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class Request:
    # Scheduler-side view of one in-flight request — fields exo reads.
    request_id: str
    prompt_token_ids: list[int] | None
    num_prompt_tokens: int
    num_computed_tokens: int  # tokens whose KV entries are already computed
    num_tokens: int  # prompt + generated so far — presumed from name, confirm against vllm source
1 change: 1 addition & 0 deletions .cuda_typings/vllm/v1/worker/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

24 changes: 24 additions & 0 deletions .cuda_typings/vllm/v1/worker/gpu_model_runner.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import torch

class _CompilationConfig:
    # Subset of vllm's compilation config that exo reads.
    static_forward_context: dict[str, object]

class _ModelConfig:
    # hf_config is presumably the transformers config object — confirm against vllm source.
    hf_config: object

class GPUModelRunner:
    # vllm's per-GPU model runner; exo patches its KV-cache allocation hooks.
    kv_caches: list[torch.Tensor]
    compilation_config: _CompilationConfig
    model_config: _ModelConfig | None
    def _allocate_kv_cache_tensors(
        self, kv_cache_config: object
    ) -> dict[str, torch.Tensor]: ...  # layer name -> raw cache tensor
    def initialize_kv_cache_tensors(
        self, kv_cache_config: object, kernel_block_sizes: list[int]
    ) -> dict[str, torch.Tensor]: ...
    def _reshape_kv_cache_tensors(
        self,
        kv_cache_config: object,
        raw_tensors: dict[str, torch.Tensor],
        kernel_block_sizes: list[int],
    ) -> dict[str, torch.Tensor]: ...
6 changes: 6 additions & 0 deletions .cuda_typings/vllm/v1/worker/gpu_worker.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from vllm.v1.worker.gpu_model_runner import GPUModelRunner

class Worker:
    # Per-GPU worker wrapping a model runner.
    model_runner: GPUModelRunner
    def determine_available_memory(self) -> int: ...  # bytes available for KV cache
    def initialize_from_config(self, kv_cache_config: object) -> None: ...
1 change: 1 addition & 0 deletions .cuda_typings/vllm/v1/worker/utils.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Parses the numeric layer index out of a vllm layer name.
def extract_layer_index(layer_name: str, num_attn_module: int) -> int: ...
5 changes: 5 additions & 0 deletions bench/eval_tool_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
ExoClient,
ExoHttpError,
add_common_instance_args,
ensure_cuda_available,
instance_id_from_instance,
nodes_used_in_instance,
resolve_model_short_id,
run_planning_phase,
settle_and_fetch_placements,
validate_vllm_args,
wait_for_instance_gone,
wait_for_instance_ready,
)
Expand Down Expand Up @@ -933,6 +935,7 @@ def main() -> None:
help="Write JSON results to stdout instead of file",
)
args = parser.parse_args()
validate_vllm_args(args)

all_scenarios = load_scenarios(SCENARIOS_PATH)
if args.scenarios:
Expand All @@ -952,6 +955,8 @@ def main() -> None:

log = sys.stderr if args.stdout else sys.stdout
exo = ExoClient(args.host, args.port, timeout_s=args.timeout)
if args.ensure_cuda:
ensure_cuda_available(exo)
_short_id, full_model_id = resolve_model_short_id(exo, args.model)

selected = settle_and_fetch_placements(
Expand Down
5 changes: 5 additions & 0 deletions bench/exo_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,13 @@
ExoClient,
ExoHttpError,
add_common_instance_args,
ensure_cuda_available,
instance_id_from_instance,
nodes_used_in_instance,
resolve_model_short_id,
run_planning_phase,
settle_and_fetch_placements,
validate_vllm_args,
wait_for_instance_gone,
wait_for_instance_ready,
)
Expand Down Expand Up @@ -280,6 +282,7 @@ def main() -> int:
help="Force all pp×tg combinations (cartesian product) even when lists have equal length.",
)
args = ap.parse_args()
validate_vllm_args(args)

pp_list = parse_int_list(args.pp)
tg_list = parse_int_list(args.tg)
Expand All @@ -304,6 +307,8 @@ def main() -> int:
logger.info(f"pp/tg mode: tandem (zip) - {len(pp_list)} pairs")

client = ExoClient(args.host, args.port, timeout_s=args.timeout)
if args.ensure_cuda:
ensure_cuda_available(client)
short_id, full_model_id = resolve_model_short_id(
client, args.model, force_download=args.force_download
)
Expand Down
5 changes: 5 additions & 0 deletions bench/exo_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,13 @@
ExoClient,
ExoHttpError,
add_common_instance_args,
ensure_cuda_available,
instance_id_from_instance,
nodes_used_in_instance,
resolve_model_short_id,
run_planning_phase,
settle_and_fetch_placements,
validate_vllm_args,
wait_for_instance_gone,
wait_for_instance_ready,
)
Expand Down Expand Up @@ -1157,6 +1159,7 @@ def main() -> int:
)

args, _ = ap.parse_known_args()
validate_vllm_args(args)

# Resolve tasks
if args.tasks:
Expand All @@ -1173,6 +1176,8 @@ def main() -> int:

# Instance management
client = ExoClient(args.host, args.port, timeout_s=args.timeout)
if args.ensure_cuda:
ensure_cuda_available(client)
instance_id: str | None = None

if not args.skip_instance_setup:
Expand Down
32 changes: 31 additions & 1 deletion bench/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,31 @@ def resolve_model_short_id(
raise ValueError(f"Model not found in /models: {model_arg}")


def validate_vllm_args(args: argparse.Namespace) -> None:
    """Reject CLI combinations that a vllm instance cannot satisfy.

    vllm placements are pipeline-parallel and single-node, so tensor
    sharding and any multi-node request are refused up front.  Exits via
    SystemExit with an explanatory message; no-op unless
    ``--instance-meta vllm`` was selected.
    """
    if args.instance_meta != "vllm":
        return
    # Ordered (violation, message) pairs — first hit wins, matching the
    # order the flags are documented in.
    checks = (
        (
            args.sharding == "tensor",
            "--instance-meta vllm is incompatible with --sharding tensor (vllm is pipeline-only)",
        ),
        (
            args.min_nodes > 1,
            "--instance-meta vllm is incompatible with --min-nodes > 1 (vllm is single-node)",
        ),
        (
            args.max_nodes > 1,
            "--instance-meta vllm is incompatible with --max-nodes > 1 (vllm is single-node)",
        ),
    )
    for violated, message in checks:
        if violated:
            raise SystemExit(message)


def ensure_cuda_available(client: ExoClient) -> None:
    """Abort unless the exo cluster advertises vllm/CUDA capability.

    Queries ``GET /capabilities`` and requires a truthy ``vllm_available``
    flag in the response; otherwise exits via SystemExit.
    """
    caps = client.request_json("GET", "/capabilities")
    vllm_ok = bool(caps) and bool(caps.get("vllm_available"))
    if not vllm_ok:
        raise SystemExit(
            "--ensure-cuda: vllm is not available on the exo cluster (no CUDA capability)"
        )


def placement_filter(instance_meta: str, wanted: str) -> bool:
s = (instance_meta or "").lower()
if wanted == "both":
Expand Down Expand Up @@ -475,7 +500,7 @@ def add_common_instance_args(ap: argparse.ArgumentParser) -> None:
help="Only consider placements using >= this many nodes.",
)
ap.add_argument(
"--instance-meta", choices=["ring", "jaccl", "both"], default="both"
"--instance-meta", choices=["ring", "jaccl", "vllm", "both"], default="both"
)
ap.add_argument(
"--sharding", choices=["pipeline", "tensor", "both"], default="both"
Expand Down Expand Up @@ -504,3 +529,8 @@ def add_common_instance_args(ap: argparse.ArgumentParser) -> None:
action="store_true",
help="Delete existing models from smallest to largest to make room for benchmark model.",
)
ap.add_argument(
"--ensure-cuda",
action="store_true",
help="Verify the exo cluster has CUDA/vllm capability; error if not.",
)
Loading
Loading