Skip to content
Draft
Show file tree
Hide file tree
Changes from 37 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
cfc8f09
Fast direct USB connectivity
rltakashige Mar 10, 2026
5e9d27b
Show Sparks and Linux in topology
rltakashige Mar 10, 2026
6be6ea5
Fix placement preview
rltakashige Mar 10, 2026
ca5870a
Some Linux Laptop/Desktop detection and goodbye penguin
rltakashige Mar 10, 2026
34df811
Vibe coding design baby
rltakashige Mar 10, 2026
659c1bc
Progress: Run EXO-CUDA through nix!
rltakashige Mar 11, 2026
9a83fa6
Patch VLLM to load multiple models dynamically
rltakashige Mar 11, 2026
ba35a4b
Move VLLM into the runner and add type stubs
rltakashige Mar 11, 2026
0c8615f
Only import VLLM once..
rltakashige Mar 11, 2026
3e097f7
only download a single copy of the model.
rltakashige Mar 11, 2026
404b976
Download models without model.safetensors.index
rltakashige Mar 11, 2026
2683ac7
Add Torch typings
rltakashige Mar 11, 2026
f75d36c
Fix cache patch
rltakashige Mar 11, 2026
9ee23ee
Make vllm inference runner closer to the normal inference runner
rltakashige Mar 11, 2026
8f94727
Ignore missing modules if type stubs exist
rltakashige Mar 11, 2026
1331465
Add missing runner features
rltakashige Mar 12, 2026
3f4f7c9
Only do for aarch64 linux
rltakashige Mar 12, 2026
4b6dd75
lockgit status
rltakashige Mar 12, 2026
957ebbd
Set max token length as max context length if no max tokens set
rltakashige Mar 12, 2026
87b7c5e
Pass CI
rltakashige Mar 12, 2026
5d7a005
Destroy process group on Keyboard Interrupt
rltakashige Mar 12, 2026
585dfe3
Merge branch 'main' into leo/dgx-spark-integrations
rltakashige Mar 12, 2026
3503011
ExoBench and ExoEval for CUDA
rltakashige Mar 12, 2026
283b180
Move VLLM runner into VLLM engine
rltakashige Mar 12, 2026
493e342
Skip impossible shardings
rltakashige Mar 12, 2026
7bb5cb4
Allow memory profiling to be unstable
rltakashige Mar 12, 2026
4a7901c
Fix patches
rltakashige Mar 12, 2026
169ea2a
Fix GPT OSS by not retokenizing prompts
rltakashige Mar 12, 2026
e9e23e5
Have loading progress
rltakashige Mar 12, 2026
594ed99
new uv lock for fastsafetensors
rltakashige Mar 13, 2026
3c29d0d
test prefix caching
rltakashige Mar 13, 2026
073f8c1
add batching
rltakashige Mar 16, 2026
dc68ddb
Prompt formatting
rltakashige Mar 16, 2026
e96f084
Distributed callbacks
rltakashige Mar 16, 2026
ec5d62f
Strip vllm generator
rltakashige Mar 16, 2026
04dcdbd
Merge main
rltakashige Mar 16, 2026
8cd1308
Tidy pass 1
rltakashige Mar 16, 2026
72cd855
Merge branch 'main' into leo/dgx-spark-integrations
rltakashige Mar 17, 2026
c70d900
Address comments including task id interface
rltakashige Mar 17, 2026
e78e53d
Address comments 2 including vllm capability in state
rltakashige Mar 17, 2026
e1df77b
No more future annotations
rltakashige Mar 17, 2026
6a3eb2f
close()
rltakashige Mar 17, 2026
cacd26e
Type error lol
rltakashige Mar 17, 2026
1dd9c28
Address comments 3, mainly refactors
rltakashige Mar 17, 2026
655185c
Address comments 4 - defer to the warmup into the exo batch generator…
rltakashige Mar 17, 2026
be731d3
Merge main
rltakashige Mar 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .cuda_typings/openai_harmony/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from enum import Enum

# Type stubs for the `openai_harmony` package — only the surface exo uses.

class HarmonyEncodingName(Enum):
    # Encoding name for GPT-OSS models.
    HARMONY_GPT_OSS = ...

class HarmonyEncoding: ...
class HarmonyError(Exception): ...

class Role(Enum):
    ASSISTANT = ...

class StreamableParser:
    # Incremental parser fed one token id at a time via process().
    last_content_delta: str  # text delta produced by the most recent token
    current_channel: str | None  # presumably the active Harmony channel — confirm against openai_harmony docs
    current_recipient: str | None  # presumably the active tool recipient — confirm against openai_harmony docs

    def __init__(self, encoding: HarmonyEncoding, role: Role = ...) -> None: ...
    def process(self, token_id: int) -> None: ...

def load_harmony_encoding(name: HarmonyEncodingName) -> HarmonyEncoding: ...
8 changes: 8 additions & 0 deletions .cuda_typings/torch/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Minimal `torch` stubs limited to the attributes exo touches.
from torch import backends as backends
from torch import cuda as cuda
from torch import distributed as distributed

__version__: str

class version:
    # Models the `torch.version` submodule; exo reads the CUDA toolkit version.
    cuda: str
1 change: 1 addition & 0 deletions .cuda_typings/torch/backends/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Re-export so `torch.backends.cuda` resolves for the type checker.
from torch.backends import cuda as cuda
1 change: 1 addition & 0 deletions .cuda_typings/torch/backends/cuda/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# True when this torch build was compiled with CUDA support.
def is_built() -> bool: ...
8 changes: 8 additions & 0 deletions .cuda_typings/torch/cuda/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
class _DeviceProperties:
    # Subset of torch's CUDA device-properties struct that exo reads.
    total_memory: int  # total device memory in bytes

def is_available() -> bool: ...
def get_device_name(device: int) -> str: ...
def get_device_properties(device: int) -> _DeviceProperties: ...
def empty_cache() -> None: ...
def mem_get_info() -> tuple[int, int]: ...  # (free, total) bytes per torch docs
2 changes: 2 additions & 0 deletions .cuda_typings/torch/distributed/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Process-group lifecycle helpers used for clean shutdown.
def is_initialized() -> bool: ...
def destroy_process_group() -> None: ...
1 change: 1 addition & 0 deletions .cuda_typings/vllm/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Minimal stub: exo only reads the installed vllm version string.
__version__: str
2 changes: 2 additions & 0 deletions .cuda_typings/vllm/config.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class ModelConfig:
    # Maximum model context length as resolved by vllm.
    max_model_len: int
Empty file.
18 changes: 18 additions & 0 deletions .cuda_typings/vllm/engine/arg_utils.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from dataclasses import dataclass

@dataclass
class EngineArgs:
    # Construction arguments for vllm's engine — only the fields exo sets.
    model: str = ...
    served_model_name: str | list[str] | None = ...
    tokenizer: str | None = ...
    trust_remote_code: bool = ...
    dtype: str = ...
    seed: int = ...
    max_model_len: int | None = ...  # None lets vllm derive it from the model config
    gpu_memory_utilization: float = ...
    enforce_eager: bool = ...
    tensor_parallel_size: int = ...
    pipeline_parallel_size: int = ...
    quantization: str | None = ...
    load_format: str = ...
    enable_sleep_mode: bool = ...
17 changes: 17 additions & 0 deletions .cuda_typings/vllm/outputs.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
class CompletionOutput:
    # One sampled completion within a RequestOutput.
    index: int
    text: str
    token_ids: list[int]
    cumulative_logprob: float | None
    logprobs: object | None
    finish_reason: str | None  # e.g. "stop" / "length" — per vllm docs; None while running
    stop_reason: int | str | None

    def finished(self) -> bool: ...

class RequestOutput:
    # Engine output for a single request, yielded by LLMEngine.step().
    request_id: str
    prompt: str | None
    prompt_token_ids: list[int] | None
    outputs: list[CompletionOutput]
    finished: bool
11 changes: 11 additions & 0 deletions .cuda_typings/vllm/sampling_params.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
class SamplingParams:
    # Per-request sampling configuration — only the fields exo sets/reads.
    n: int
    temperature: float
    top_p: float
    top_k: int
    min_p: float
    seed: int | None
    stop: str | list[str] | None
    max_tokens: int | None  # None means no generation cap beyond context length
    logprobs: int | None
    repetition_penalty: float
3 changes: 3 additions & 0 deletions .cuda_typings/vllm/tokenizers/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Public re-export of the tokenizer protocol.
from vllm.tokenizers.protocol import TokenizerLike

__all__ = ["TokenizerLike"]
15 changes: 15 additions & 0 deletions .cuda_typings/vllm/tokenizers/protocol.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import Protocol

class TokenizerLike(Protocol):
    # Structural type for the tokenizer objects vllm hands back; matches the
    # HuggingFace-style encode/decode/chat-template surface exo calls.
    @property
    def eos_token_id(self) -> int: ...
    @property
    def vocab_size(self) -> int: ...
    def encode(self, text: str, add_special_tokens: bool = ...) -> list[int]: ...
    def decode(self, ids: list[int] | int, skip_special_tokens: bool = ...) -> str: ...
    def apply_chat_template(
        self,
        messages: list[dict[str, str]],
        tools: list[dict[str, object]] | None = ...,
        **kwargs: object,
    ) -> str | list[int]: ...  # str or token ids depending on tokenize kwarg — confirm against HF docs
1 change: 1 addition & 0 deletions .cuda_typings/vllm/v1/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions .cuda_typings/vllm/v1/core/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

24 changes: 24 additions & 0 deletions .cuda_typings/vllm/v1/core/kv_cache_manager.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from collections.abc import Sequence

from vllm.v1.core.kv_cache_utils import BlockPool, KVCacheBlock
from vllm.v1.kv_cache_interface import KVCacheConfig

class KVCacheBlocks:
    # Blocks allocated to one request, grouped per KV-cache group.
    blocks: tuple[Sequence[KVCacheBlock], ...]
    def __init__(self, blocks: tuple[Sequence[KVCacheBlock], ...]) -> None: ...
    def get_block_ids(self) -> tuple[list[int], ...]: ...

class KVCacheManager:
    # vllm's paged KV-cache allocator; exo inspects/patches these members.
    block_pool: BlockPool
    kv_cache_config: KVCacheConfig
    enable_caching: bool  # whether prefix caching is enabled
    num_kv_cache_groups: int
    coordinator: object
    def __init__(self, *args: object, **kwargs: object) -> None: ...
    def allocate_slots(
        self, request: object, num_new_tokens: int, *args: object, **kwargs: object
    ) -> KVCacheBlocks | None: ...  # None presumably signals allocation failure — confirm against vllm source
    def get_computed_blocks(self, request: object) -> tuple[KVCacheBlocks, int]: ...
    def create_kv_cache_blocks(
        self, blocks: tuple[list[KVCacheBlock], ...]
    ) -> KVCacheBlocks: ...
16 changes: 16 additions & 0 deletions .cuda_typings/vllm/v1/core/kv_cache_utils.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
class KVCacheBlock:
    # One fixed-size block of the paged KV cache.
    block_id: int
    ref_cnt: int  # reference count across requests sharing the block
    def __init__(self, block_id: int) -> None: ...

class FreeKVCacheBlockQueue:
    # Free-list with bulk append/pop.
    def append_n(self, blocks: list[KVCacheBlock]) -> None: ...
    def popleft_n(self, n: int) -> list[KVCacheBlock]: ...

class BlockPool:
    # Owns all KV-cache blocks and the free queue.
    blocks: list[KVCacheBlock]
    free_block_queue: FreeKVCacheBlockQueue
    num_gpu_blocks: int
    enable_caching: bool
    def get_num_free_blocks(self) -> int: ...
    def get_new_blocks(self, num_blocks: int) -> list[KVCacheBlock]: ...
Empty file.
22 changes: 22 additions & 0 deletions .cuda_typings/vllm/v1/engine/llm_engine.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from vllm.config import ModelConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import TokenizerLike

class LLMEngine:
    # Synchronous v1 engine surface used by exo's vllm runner.
    tokenizer: TokenizerLike | None
    model_config: ModelConfig

    @classmethod
    def from_engine_args(cls, engine_args: EngineArgs) -> LLMEngine: ...
    def add_request(
        self,
        request_id: str,
        prompt: str,
        params: SamplingParams,
        arrival_time: float | None = ...,
    ) -> None: ...
    def step(self) -> list[RequestOutput]: ...  # one schedule/execute iteration per vllm docs
    def has_unfinished_requests(self) -> bool: ...
    def get_tokenizer(self) -> TokenizerLike: ...
17 changes: 17 additions & 0 deletions .cuda_typings/vllm/v1/kv_cache_interface.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from dataclasses import dataclass

@dataclass
class KVCacheSpec:
    # Shape parameters of one attention layer's KV cache.
    block_size: int
    num_kv_heads: int
    head_size: int

@dataclass
class KVCacheGroupSpec:
    # Layers that share a single KV-cache spec.
    layer_names: list[str]
    kv_cache_spec: KVCacheSpec

@dataclass
class KVCacheConfig:
    # Overall cache layout: block count plus per-group specs.
    num_blocks: int
    kv_cache_groups: list[KVCacheGroupSpec]
6 changes: 6 additions & 0 deletions .cuda_typings/vllm/v1/request.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class Request:
    # Scheduler-side view of one in-flight request — fields exo reads.
    request_id: str
    prompt_token_ids: list[int] | None
    num_prompt_tokens: int
    num_computed_tokens: int  # tokens whose KV entries are already computed
    num_tokens: int  # prompt + generated so far — presumed from name, confirm against vllm source
1 change: 1 addition & 0 deletions .cuda_typings/vllm/v1/worker/__init__.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

24 changes: 24 additions & 0 deletions .cuda_typings/vllm/v1/worker/gpu_model_runner.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import torch

class _CompilationConfig:
    # Subset of vllm's compilation config that exo reads.
    static_forward_context: dict[str, object]

class _ModelConfig:
    # hf_config is presumably the transformers config object — confirm against vllm source.
    hf_config: object

class GPUModelRunner:
    # vllm's per-GPU model runner; exo patches its KV-cache allocation hooks.
    kv_caches: list[torch.Tensor]
    compilation_config: _CompilationConfig
    model_config: _ModelConfig | None
    def _allocate_kv_cache_tensors(
        self, kv_cache_config: object
    ) -> dict[str, torch.Tensor]: ...  # layer name -> raw cache tensor
    def initialize_kv_cache_tensors(
        self, kv_cache_config: object, kernel_block_sizes: list[int]
    ) -> dict[str, torch.Tensor]: ...
    def _reshape_kv_cache_tensors(
        self,
        kv_cache_config: object,
        raw_tensors: dict[str, torch.Tensor],
        kernel_block_sizes: list[int],
    ) -> dict[str, torch.Tensor]: ...
6 changes: 6 additions & 0 deletions .cuda_typings/vllm/v1/worker/gpu_worker.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from vllm.v1.worker.gpu_model_runner import GPUModelRunner

class Worker:
    # Per-GPU worker wrapping a model runner.
    model_runner: GPUModelRunner
    def determine_available_memory(self) -> int: ...  # bytes available for KV cache
    def initialize_from_config(self, kv_cache_config: object) -> None: ...
1 change: 1 addition & 0 deletions .cuda_typings/vllm/v1/worker/utils.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Parses the numeric layer index out of a vllm layer name.
def extract_layer_index(layer_name: str, num_attn_module: int) -> int: ...
5 changes: 5 additions & 0 deletions bench/eval_tool_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
ExoClient,
ExoHttpError,
add_common_instance_args,
ensure_cuda_available,
instance_id_from_instance,
nodes_used_in_instance,
resolve_model_short_id,
run_planning_phase,
settle_and_fetch_placements,
validate_vllm_args,
wait_for_instance_gone,
wait_for_instance_ready,
)
Expand Down Expand Up @@ -933,6 +935,7 @@ def main() -> None:
help="Write JSON results to stdout instead of file",
)
args = parser.parse_args()
validate_vllm_args(args)

all_scenarios = load_scenarios(SCENARIOS_PATH)
if args.scenarios:
Expand All @@ -952,6 +955,8 @@ def main() -> None:

log = sys.stderr if args.stdout else sys.stdout
exo = ExoClient(args.host, args.port, timeout_s=args.timeout)
if args.ensure_cuda:
ensure_cuda_available(exo)
_short_id, full_model_id = resolve_model_short_id(exo, args.model)

selected = settle_and_fetch_placements(
Expand Down
5 changes: 5 additions & 0 deletions bench/exo_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,13 @@
ExoClient,
ExoHttpError,
add_common_instance_args,
ensure_cuda_available,
instance_id_from_instance,
nodes_used_in_instance,
resolve_model_short_id,
run_planning_phase,
settle_and_fetch_placements,
validate_vllm_args,
wait_for_instance_gone,
wait_for_instance_ready,
)
Expand Down Expand Up @@ -280,6 +282,7 @@ def main() -> int:
help="Force all pp×tg combinations (cartesian product) even when lists have equal length.",
)
args = ap.parse_args()
validate_vllm_args(args)

pp_list = parse_int_list(args.pp)
tg_list = parse_int_list(args.tg)
Expand All @@ -304,6 +307,8 @@ def main() -> int:
logger.info(f"pp/tg mode: tandem (zip) - {len(pp_list)} pairs")

client = ExoClient(args.host, args.port, timeout_s=args.timeout)
if args.ensure_cuda:
ensure_cuda_available(client)
short_id, full_model_id = resolve_model_short_id(
client, args.model, force_download=args.force_download
)
Expand Down
5 changes: 5 additions & 0 deletions bench/exo_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,13 @@
ExoClient,
ExoHttpError,
add_common_instance_args,
ensure_cuda_available,
instance_id_from_instance,
nodes_used_in_instance,
resolve_model_short_id,
run_planning_phase,
settle_and_fetch_placements,
validate_vllm_args,
wait_for_instance_gone,
wait_for_instance_ready,
)
Expand Down Expand Up @@ -1157,6 +1159,7 @@ def main() -> int:
)

args, _ = ap.parse_known_args()
validate_vllm_args(args)

# Resolve tasks
if args.tasks:
Expand All @@ -1173,6 +1176,8 @@ def main() -> int:

# Instance management
client = ExoClient(args.host, args.port, timeout_s=args.timeout)
if args.ensure_cuda:
ensure_cuda_available(client)
instance_id: str | None = None

if not args.skip_instance_setup:
Expand Down
32 changes: 31 additions & 1 deletion bench/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,31 @@ def resolve_model_short_id(
raise ValueError(f"Model not found in /models: {model_arg}")


def validate_vllm_args(args: argparse.Namespace) -> None:
    """Reject CLI combinations that a vllm instance cannot satisfy.

    vllm placements are pipeline-parallel and single-node, so tensor
    sharding and any multi-node request are refused up front.  Exits via
    SystemExit with an explanatory message; no-op unless
    ``--instance-meta vllm`` was selected.
    """
    if args.instance_meta != "vllm":
        return
    # Ordered (violation, message) pairs — first hit wins, matching the
    # order the flags are documented in.
    checks = (
        (
            args.sharding == "tensor",
            "--instance-meta vllm is incompatible with --sharding tensor (vllm is pipeline-only)",
        ),
        (
            args.min_nodes > 1,
            "--instance-meta vllm is incompatible with --min-nodes > 1 (vllm is single-node)",
        ),
        (
            args.max_nodes > 1,
            "--instance-meta vllm is incompatible with --max-nodes > 1 (vllm is single-node)",
        ),
    )
    for violated, message in checks:
        if violated:
            raise SystemExit(message)


def ensure_cuda_available(client: ExoClient) -> None:
    """Abort unless the exo cluster advertises vllm/CUDA capability.

    Queries ``GET /capabilities`` and requires a truthy ``vllm_available``
    flag in the response; otherwise exits via SystemExit.
    """
    caps = client.request_json("GET", "/capabilities")
    vllm_ok = bool(caps) and bool(caps.get("vllm_available"))
    if not vllm_ok:
        raise SystemExit(
            "--ensure-cuda: vllm is not available on the exo cluster (no CUDA capability)"
        )


def placement_filter(instance_meta: str, wanted: str) -> bool:
s = (instance_meta or "").lower()
if wanted == "both":
Expand Down Expand Up @@ -475,7 +500,7 @@ def add_common_instance_args(ap: argparse.ArgumentParser) -> None:
help="Only consider placements using >= this many nodes.",
)
ap.add_argument(
"--instance-meta", choices=["ring", "jaccl", "both"], default="both"
"--instance-meta", choices=["ring", "jaccl", "vllm", "both"], default="both"
)
ap.add_argument(
"--sharding", choices=["pipeline", "tensor", "both"], default="both"
Expand Down Expand Up @@ -504,3 +529,8 @@ def add_common_instance_args(ap: argparse.ArgumentParser) -> None:
action="store_true",
help="Delete existing models from smallest to largest to make room for benchmark model.",
)
ap.add_argument(
"--ensure-cuda",
action="store_true",
help="Verify the exo cluster has CUDA/vllm capability; error if not.",
)
Loading
Loading