Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
101c31a
Clean final changes
Masseeh May 1, 2026
c80ab3a
add length penalty
Masseeh May 2, 2026
a577268
add difficulty-aware penalty
Masseeh May 2, 2026
63eb59b
new ray
Masseeh May 5, 2026
55368e1
fix timeout
Masseeh May 5, 2026
f4b50fc
match tir and cube function calls
Masseeh May 5, 2026
0d04b10
move cube-rl inside pipelinerl package
Masseeh May 5, 2026
263ab86
refactor based on new llm_router
Masseeh May 5, 2026
80ccec8
fix agent config
Masseeh May 6, 2026
5378cd4
remove top_k
Masseeh May 6, 2026
c0e6c45
flushing wandb logs
Masseeh May 7, 2026
73dd4b3
add chapt template for results viewer
Masseeh May 7, 2026
7918110
Makefile
Masseeh May 7, 2026
4692834
disentangle number of ray workers from llm_max_rollouts
Masseeh May 8, 2026
18a3654
multi-replica support
Masseeh May 8, 2026
7ac8bea
update example
Masseeh May 8, 2026
2459ed7
Refactor cube rollouts around lazy workers
Masseeh May 8, 2026
0c8194a
Configure cube math as multi-cube workload
Masseeh May 8, 2026
4aaa84c
fix lag during eval
Masseeh May 10, 2026
1ab614f
implemented max_lag for CUBE in
Masseeh May 10, 2026
1d5537d
add ray cluster for multi-node
Masseeh May 11, 2026
699d793
faster startup process for cube
Masseeh May 11, 2026
647f03b
improve ray teardown logic
Masseeh May 11, 2026
e502aeb
Surface cube rollout routing metadata
Masseeh May 12, 2026
ef7a01f
Stop tracking debugging tools
Masseeh May 13, 2026
03cf3f8
Stop tracking Makefile
Masseeh May 13, 2026
b0a73d3
add rollout-level vLLM affinity routing
Masseeh May 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,11 @@ package-lock.json
package.json
results
results/
results-batch-jobs
results-batch-jobs/
data/
cache/
dump.rdb
scripts/
results_viewer/
Makefile
15 changes: 15 additions & 0 deletions conf/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@ finetune:
seed: ${..seed}

actor:
launcher: asyncio
ray_address: null
launch_ray_cluster: false
ray_head_port: 6379
ray_node_manager_port: 6380
ray_object_manager_port: 6381
ray_worker_port_start: 20000
ray_worker_port_count: null
ray_num_cpus_per_node: null
ray_extra_cpus_per_node: 1
log_each_n_secs: 0
llm_max_rollouts: 64
rollout_workers: 1
Expand All @@ -22,6 +32,10 @@ actor:
result_queue_size: 64
throughput_window_size: 50
shared_memory_entry_size: 10000000
difficulty_aware_penalty:
enabled: false
gamma: 0.5
failure_scale: 0.5
environment: null
preprocess:
input: actor
Expand Down Expand Up @@ -113,6 +127,7 @@ pop_old_data: true
max_lag: null
attempts: ${finetune.attempts}
train_subset: null
test_subset: null
debug:
mode: ""
streams_from: null
Expand Down
115 changes: 115 additions & 0 deletions conf/cube_math_tool.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Hydra defaults list: start from the `base` config, replace the `rewards`
# config-group choice with `success_and_format`, and list `_self_` last so
# the keys in this file are composed after (and override) those defaults.
defaults:
- base
- override rewards: success_and_format
- _self_

# Run-level settings.
# output_dir embeds the launch date and time through the `now` resolver, so
# each run writes to its own directory under results/cube_math_tool/.
output_dir: results/cube_math_tool/${now:%Y-%m-%d}/${now:%H-%M-%S}
# NOTE(review): absolute, site-specific path; confirm it exists on the
# machine(s) this config is launched on. It is also reused as the agent's
# tokenizer_name via ${model_path}.
model_path: /mnt/llmd/base_models/Qwen2.5-7B-Instruct
litellm_logging_level: info
# NOTE(review): presumably Ray debugging switches; the code that reads
# ray_debug / ray_local_mode is not visible here. Confirm before changing.
ray_debug: 0
ray_local_mode: false

# Actor overrides for this workload.
# launcher is set to `ray` and llm_max_rollouts to 128; the conf/base.yaml
# values shown in the same change set are `asyncio` and 64 respectively.
# cube_workers and llm_max_rollouts are both 128 here.
# NOTE(review): confirm whether those two are meant to be kept equal.
# NOTE(review): the ray_* and cube_* keys are presumably consumed by the
# Ray-based cube rollout code, which is not visible here; the meaning of
# cube_eval_workers_fraction and of the `null` values (ray_num_cpus,
# ray_worker_log_path) should be confirmed against that code.
actor:
launcher: ray
ray_num_cpus: null
cube_workers_num_cpus: 1.0
cube_eval_workers_fraction: 0.5
cube_workers: 128
llm_max_rollouts: 128
ray_worker_log_enabled: true
ray_worker_log_path: null
ray_worker_log_level: WARNING
ray_worker_litellm_log_level: CRITICAL

# Sampling parameters for the `llm` entry.
# Both max_tokens and max_completion_tokens are set to the same limit (16000).
# NOTE(review): presumably both are set so that whichever name the client
# honours applies; confirm which one is actually read.
llm:
parameters:
max_tokens: 16000
max_completion_tokens: 16000
temperature: 1.0

# Sampling parameters for the `test_llm` entry.
# Token limits and temperature match the `llm` entry; the only additional
# setting here is top_p (0.95).
# NOTE(review): presumably used for evaluation rollouts; confirm in the caller.
test_llm:
parameters:
max_tokens: 16000
max_completion_tokens: 16000
temperature: 1.0
top_p: 0.95

# vLLM server arguments.
# served_model_name is also the value the agent's llm_config.model_name
# resolves to (via ${vllm_config.vllm_kwargs.served_model_name}), so the two
# stay in sync when it is changed here.
# max_model_len (32000) equals finetune.seq_length in this file.
# enable-auto-tool-choice carries an empty string.
# NOTE(review): presumably that is how a value-less CLI flag is expressed for
# the launcher; confirm.
# tool-parser-plugin is resolved against ${hydra:runtime.cwd}, i.e. the
# directory the job was launched from, so the job must be started from the
# repository root for the relative path pipelinerl/rl_tool_parser_plugin.py
# to resolve.
vllm_config:
vllm_kwargs:
max_model_len: 32000
served_model_name: Qwen2.5-7B-Instruct
enable-auto-tool-choice: ""
tool-call-parser: rl_tool
tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py

# Trainer overrides.
# seq_length (32000) matches vllm_config.vllm_kwargs.max_model_len in this file.
# NOTE(review): seq_parallel and gradient_accumulation_passes are read by the
# trainer, which is not visible here; confirm 8 / 1024 suit the target
# hardware layout.
# rl selects the `gspo` policy loss and turns on overlong_filtering.
finetune:
seq_length: 32000
seq_parallel: 8
gradient_accumulation_passes: 1024
rl:
policy_loss: gspo
overlong_filtering: true

# Preprocessor stage: reads from `actor`, writes to `training_data`, using
# 8 workers.
# shared_memory_entry_size is 1000000000 (1e9) here.
# NOTE(review): units are not stated in this file (presumably bytes); confirm
# the hosts have enough shared memory for this entry size.
preprocess:
input: actor
output: training_data
n_workers: 8
shared_memory_entry_size: 1000000000

# Cube workload parameters: resource-guard limits, a seed, and the list of
# cubes that follows.
# NOTE(review): the *_gb keys are presumably gigabytes and
# memory_usage_threshold a fraction of total memory (0.90); confirm against
# the resource-guard implementation, which is not visible here.
# seed interpolates the top-level `seed` key, which this file does not
# define; it is expected to come from the composed defaults.
cube_params:
resource_guard:
actor_memory_gb: 1.25
memory_overhead_gb: 8.0
memory_usage_threshold: 0.90
seed: ${seed}
# Cube list. This first entry (train split) also defines two YAML anchors,
# &math_tool_benchmark and &tir_agent, which the later entries reuse through
# aliases; editing the benchmark or agent here therefore changes every cube.
# NOTE(review): sandbox_endpoint is a hard-coded, deployment-specific
# hostname; confirm it is reachable from the rollout workers.
# The agent's llm_config takes model_name from
# vllm_config.vllm_kwargs.served_model_name and tokenizer_name from
# model_path via interpolation.
# NOTE(review): timeout is presumably in seconds (3600.0); confirm.
# The system_prompt block scalar runs up to the max_actions key; do not add
# comments inside it, as they would become part of the prompt text.
# NOTE(review): the prompt describes plan -> one run_python_code call ->
# MathAnswer and also asks for a retry on failure, while max_actions is 3;
# confirm the action budget leaves room for a retry.
cubes:
- id: open_reasoner_zero_57k
split: train
dataset_name: open_reasoner_zero_57k
benchmark: &math_tool_benchmark
_target_: math_tool_use.benchmark.MathToolUseBenchmark
default_tool_config:
_target_: math_tool_use.tool.MathToolUseToolConfig
sandbox_endpoint: http://dns-24e3447c-506e-4b21-92df-156e18db5087-sandboxfusion
agent: &tir_agent
_target_: cube_harness.agents.tir.TirAgentConfig
llm_config:
_target_: cube_harness.llm.RoutedLLMConfig
model_name: ${vllm_config.vllm_kwargs.served_model_name}
tokenizer_name: ${model_path}
timeout: 3600.0
num_retries: 1
extra_body:
return_token_ids: true
system_prompt: |
You are a math-focused AI Agent. Solve problems by combining clear symbolic reasoning
with short, deterministic Python code.
Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration.
Always present the final answer in LaTeX \boxed{}.
Do not express emotions or opinions about user questions.

Workflow:
1. Draft a brief plan in plain text.
2. Execute one run_python_code call to compute or verify the result.
3. Finalize by calling MathAnswer with the LaTeX-formatted answer.

Python execution policy (run_python_code):
- Use Python strictly for pure computation to verify and validate the final answer.
- No network, file system, OS or environment access.
- Keep snippets minimal and self-contained; print only the final result.

Validation:
- Cross-check results (alternative derivation, invariants, higher precision) before finalizing.
- If execution fails, propose the minimal fix and retry.
Always verify with run_python_code before invoking MathAnswer.
max_actions: 3
# Remaining cubes: a second train-split dataset and the aime_2025 test-split
# dataset. Both reuse the benchmark and agent definitions through the
# *math_tool_benchmark and *tir_agent aliases, so they differ from the first
# cube only in id, split and dataset_name.
- id: open_reasoner_zero_extended_72k
split: train
dataset_name: open_reasoner_zero_extended_72k
benchmark: *math_tool_benchmark
agent: *tir_agent
- id: aime_2025
split: test
dataset_name: aime_2025
benchmark: *math_tool_benchmark
agent: *tir_agent
4 changes: 4 additions & 0 deletions conf/tir.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,14 @@ rewards:
environments:
- key: math
mode: remote
replicas_per_actor: ${world.env_replicas_per_actor}
_target_: pipelinerl.domains.math.MathEnvironment
environment_key: math
dataset_loader: pipelinerl.domains.math.load_datasets

world:
env_replicas_per_actor: 1

train_dataset_names:
- open_reasoner_zero_57k
- open_reasoner_zero_extended_72k
Expand Down
Loading