ServiceNow · Masseeh · May 1, 2026 · May 2, 2026 · May 2, 2026 · May 5, 2026
diff --git a/.gitignore b/.gitignore
@@ -183,7 +183,11 @@ package-lock.json
 package.json
 results
 results/
+results-batch-jobs
+results-batch-jobs/
 data/
 cache/
 dump.rdb
 scripts/
+results_viewer/
+Makefile
diff --git a/conf/base.yaml b/conf/base.yaml
@@ -14,6 +14,16 @@ finetune:
   seed: ${..seed}
 
 actor:
+  launcher: asyncio  # base default; conf/cube_math_tool.yaml overrides this with `ray`
+  ray_address: null  # NOTE(review): presumably the address of an existing Ray cluster to attach to - confirm
+  launch_ray_cluster: false
+  ray_head_port: 6379  # NOTE(review): 6379 is also the stock Redis port - confirm nothing else on the host binds it
+  ray_node_manager_port: 6380
+  ray_object_manager_port: 6381
+  ray_worker_port_start: 20000  # NOTE(review): presumably the first port of the worker range sized by ray_worker_port_count - confirm
+  ray_worker_port_count: null  # NOTE(review): what null selects is decided by the launcher code, not visible here
+  ray_num_cpus_per_node: null
+  ray_extra_cpus_per_node: 1  # NOTE(review): presumably added on top of ray_num_cpus_per_node - confirm in the launcher
   log_each_n_secs: 0
   llm_max_rollouts: 64
   rollout_workers: 1
@@ -22,6 +32,10 @@ actor:
   result_queue_size: 64
   throughput_window_size: 50
   shared_memory_entry_size: 10000000
+  difficulty_aware_penalty:  # opt-in feature: ships disabled (enabled: false)
+    enabled: false
+    gamma: 0.5  # NOTE(review): semantics of gamma and failure_scale live in the consuming code, not visible here
+    failure_scale: 0.5
 environment: null
 preprocess:
   input: actor
@@ -113,6 +127,7 @@ pop_old_data: true
 max_lag: null
 attempts: ${finetune.attempts}
 train_subset: null
+test_subset: null  # NOTE(review): new sibling of train_subset; null presumably means no subsetting - confirm
 debug:
   mode: ""
   streams_from: null

diff --git a/conf/cube_math_tool.yaml b/conf/cube_math_tool.yaml
@@ -0,0 +1,115 @@
+defaults:  # Hydra defaults list: composition order for this config
+  - base  # inherit conf/base.yaml
+  - override rewards: success_and_format  # replace the `rewards` config-group choice made upstream
+  - _self_  # this file is merged last, so its keys win over the entries above
+
+output_dir: results/cube_math_tool/${now:%Y-%m-%d}/${now:%H-%M-%S}  # one date/time-stamped directory per launch
+model_path: /mnt/llmd/base_models/Qwen2.5-7B-Instruct  # NOTE(review): cluster-specific absolute path; override on other machines
+litellm_logging_level: info
+ray_debug: 0
+ray_local_mode: false
+
+actor:
+  launcher: ray  # overrides the asyncio default from conf/base.yaml
+  ray_num_cpus: null  # NOTE(review): base.yaml declares ray_num_cpus_per_node, not ray_num_cpus - confirm this key is read
+  cube_workers_num_cpus: 1.0
+  cube_eval_workers_fraction: 0.5
+  cube_workers: 128
+  llm_max_rollouts: 128  # base.yaml default is 64
+  ray_worker_log_enabled: true
+  ray_worker_log_path: null
+  ray_worker_log_level: WARNING
+  ray_worker_litellm_log_level: CRITICAL
+
+llm:
+  parameters:
+    max_tokens: 16000  # kept equal to max_completion_tokens on the next line
+    max_completion_tokens: 16000
+    temperature: 1.0
+
+test_llm:
+  parameters:
+    max_tokens: 16000  # same limits as llm.parameters
+    max_completion_tokens: 16000
+    temperature: 1.0
+    top_p: 0.95  # NOTE(review): set for test_llm only; llm.parameters sets no top_p in this file
+
+vllm_config:
+  vllm_kwargs:
+    max_model_len: 32000  # same value as finetune.seq_length in this file
+    served_model_name: Qwen2.5-7B-Instruct  # interpolated into the agent's llm_config.model_name in this file
+    enable-auto-tool-choice: ""  # NOTE(review): empty value presumably renders as a bare flag - confirm
+    tool-call-parser: rl_tool
+    tool-parser-plugin: ${hydra:runtime.cwd}/pipelinerl/rl_tool_parser_plugin.py  # resolved against the directory the job was launched from
+
+finetune:
+  seq_length: 32000  # same value as vllm_config.vllm_kwargs.max_model_len in this file
+  seq_parallel: 8
+  gradient_accumulation_passes: 1024
+  rl:
+    policy_loss: gspo
+    overlong_filtering: true
+
+preprocess:
+  input: actor  # same value conf/base.yaml already sets for preprocess.input
+  output: training_data
+  n_workers: 8
+  shared_memory_entry_size: 1000000000  # 1e9; for comparison base.yaml sets actor.shared_memory_entry_size to 1e7
+
+cube_params:
+  resource_guard:
+    actor_memory_gb: 1.25
+    memory_overhead_gb: 8.0
+    memory_usage_threshold: 0.90  # NOTE(review): presumably a fraction of total memory - confirm
+  seed: ${seed}  # reuses the top-level seed
+  cubes:  # two train cubes and one test cube, all sharing one benchmark and one agent via YAML anchors
+    - id: open_reasoner_zero_57k
+      split: train
+      dataset_name: open_reasoner_zero_57k
+      benchmark: &math_tool_benchmark  # anchor; aliased by the two later cubes
+        _target_: math_tool_use.benchmark.MathToolUseBenchmark
+        default_tool_config:
+          _target_: math_tool_use.tool.MathToolUseToolConfig
+          sandbox_endpoint: http://dns-24e3447c-506e-4b21-92df-156e18db5087-sandboxfusion  # NOTE(review): deployment-specific hostname hard-coded here
+      agent: &tir_agent  # anchor; aliased by the two later cubes
+        _target_: cube_harness.agents.tir.TirAgentConfig
+        llm_config:
+          _target_: cube_harness.llm.RoutedLLMConfig
+          model_name: ${vllm_config.vllm_kwargs.served_model_name}  # must match the name vLLM serves under
+          tokenizer_name: ${model_path}
+          timeout: 3600.0  # NOTE(review): presumably seconds - confirm
+          num_retries: 1
+          extra_body:
+            return_token_ids: true
+        system_prompt: |
+          You are a math-focused AI Agent. Solve problems by combining clear symbolic reasoning
+          with short, deterministic Python code.
+          Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration.
+          Always present the final answer in LaTeX \boxed{}.
+          Do not express emotions or opinions about user questions.
+
+          Workflow:
+          1. Draft a brief plan in plain text.
+          2. Execute one run_python_code call to compute or verify the result.
+          3. Finalize by calling MathAnswer with the LaTeX-formatted answer.
+
+          Python execution policy (run_python_code):
+          - Use Python strictly for pure computation to verify and validate the final answer.
+          - No network, file system, OS or environment access.
+          - Keep snippets minimal and self-contained; print only the final result.
+
+          Validation:
+          - Cross-check results (alternative derivation, invariants, higher precision) before finalizing.
+          - If execution fails, propose the minimal fix and retry.
+          Always verify with run_python_code before invoking MathAnswer.
+        max_actions: 3  # NOTE(review): the prompt lists a three-step workflow yet also asks to retry failed executions - confirm 3 leaves room
+    - id: open_reasoner_zero_extended_72k
+      split: train
+      dataset_name: open_reasoner_zero_extended_72k
+      benchmark: *math_tool_benchmark  # same benchmark mapping as the first cube
+      agent: *tir_agent  # same agent mapping as the first cube
+    - id: aime_2025
+      split: test
+      dataset_name: aime_2025
+      benchmark: *math_tool_benchmark  # same benchmark mapping as the first cube
+      agent: *tir_agent  # same agent mapping as the first cube
diff --git a/conf/tir.yaml b/conf/tir.yaml
@@ -41,10 +41,14 @@ rewards:
 environments:
   - key: math
     mode: remote
+    replicas_per_actor: ${world.env_replicas_per_actor}  # interpolates world.env_replicas_per_actor, which this file sets to 1
     _target_: pipelinerl.domains.math.MathEnvironment
 environment_key: math
 dataset_loader: pipelinerl.domains.math.load_datasets
 
+world:
+  env_replicas_per_actor: 1  # consumed by the math environment's replicas_per_actor interpolation in this file
+
 train_dataset_names:
   - open_reasoner_zero_57k
   - open_reasoner_zero_extended_72k