Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,10 @@ Prime Lab connects verifiers environments to evaluations, GEPA prompt optimizati
# Set up a Lab workspace
prime lab setup

# Create a Lab project and make it active in this workspace
prime project create "Alphabet Sort Baselines"
prime project current

# List trainable models, capacity, and token pricing
prime train models

Expand All @@ -163,6 +167,7 @@ prime train init

# Launch the run from the generated config
prime train rl.toml
prime train rl.toml --project <project-id>

# Inspect and manage Hosted Training runs
prime train list
Expand All @@ -171,6 +176,26 @@ prime train metrics <run-id>
prime train checkpoints <run-id>
```

Lab projects group related training runs, evaluations, and adapters. Use
`prime project use <project-id>` to switch the active workspace project, or
`prime project clear` to clear it. Existing runs and adapters can be assigned to
a project, removed from a specific project, or cleared from all projects;
evaluations can only be assigned to a project or cleared from it.

```bash
# Manage projects
prime project list
prime project show <project-id>
prime project update <project-id> --description "Baseline alphabet sort runs"

# Attach existing artifacts
prime project assign run <run-id> <project-id>
prime project remove run <run-id> <project-id>
prime project assign adapter <adapter-id> <project-id>
prime project remove adapter <adapter-id> # clear all adapter project memberships
prime project assign eval <eval-id> <project-id>
prime project remove eval <eval-id> # clear the evaluation project
```

### GPU Resources

```bash
Expand Down Expand Up @@ -211,6 +236,7 @@ prime eval push

# Push specific eval directory (verifiers format)
prime eval push outputs/evals/gsm8k--gpt-4/abc123
prime eval push outputs/evals/gsm8k--gpt-4/abc123 --project <project-id>

# Push a public evaluation (default is private)
prime eval push --public
Expand Down
26 changes: 25 additions & 1 deletion packages/prime-evals/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ eval_response = client.create_evaluation(
model_name="gpt-4o-mini",
dataset="gsm8k",
framework="verifiers",
project_id="project-id",
metadata={
"version": "1.0",
"num_examples": 10,
Expand Down Expand Up @@ -220,6 +221,29 @@ client.finalize_evaluation(eval_id, metrics=eval_data.get("metrics"))
print(f"Successfully pushed evaluation: {eval_id}")
```

## Project Attachment

Evaluations can be created inside a Lab project, moved to another project, or
cleared from their project. An evaluation's project can only be set or cleared;
removing an evaluation from one specific project is not supported.

```python
eval_response = client.create_evaluation(
name="gsm8k-project-baseline",
environments=[{"id": "gsm8k"}],
model_name="gpt-4o-mini",
project_id="project-id",
)

eval_id = eval_response["evaluation_id"]

# Move the evaluation to another project
client.update_evaluation(eval_id, project_id="another-project-id")

# Clear the evaluation project
client.update_evaluation(eval_id, clear_project=True)
```

## API Reference

### EvalsClient
Expand All @@ -232,6 +256,7 @@ Main client for interacting with the Prime Evals API.
- `push_samples()` - Push evaluation samples
- `finalize_evaluation()` - Finalize an evaluation with final metrics
- `get_evaluation()` - Get evaluation details by ID
- `update_evaluation()` - Update evaluation details or assign/clear a project
- `list_evaluations()` - List evaluations with optional filters
- `get_samples()` - Get samples for an evaluation

Expand Down Expand Up @@ -276,4 +301,3 @@ except EvalsAPIError as e:
## License

MIT License - see LICENSE file for details

38 changes: 28 additions & 10 deletions packages/prime-evals/src/prime_evals/evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ def _lookup_environment_by_slug(self, owner_slug: str, name: str) -> str:
EvalsAPIError: If the environment does not exist (404)
"""
try:
response = self.client.get(f"/environmentshub/{owner_slug}/{name}/@latest")
details = response.get("data", response)
return details["id"]
lookup_data: Dict[str, Any] = {"name": name, "owner_slug": owner_slug}
response = self.client.post("/environmentshub/lookup", json=lookup_data)
return response["data"]["id"]
except APIError as e:
raise EvalsAPIError(
f"Environment '{owner_slug}/{name}' does not exist in the hub. "
Expand Down Expand Up @@ -159,6 +159,7 @@ def create_evaluation(
task_type: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[List[str]] = None,
project_id: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
metrics: Optional[Dict[str, Any]] = None,
is_public: Optional[bool] = None,
Expand Down Expand Up @@ -202,6 +203,7 @@ def create_evaluation(
"task_type": task_type,
"description": description,
"tags": tags or [],
"project_id": project_id,
"metadata": metadata,
"metrics": metrics,
}
Expand Down Expand Up @@ -365,6 +367,8 @@ def update_evaluation(
task_type: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[List[str]] = None,
project_id: Optional[str] = None,
clear_project: bool = False,
metadata: Optional[Dict[str, Any]] = None,
metrics: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
Expand All @@ -375,11 +379,16 @@ def update_evaluation(
"framework": framework,
"task_type": task_type,
"description": description,
"tags": tags if tags is not None else [],
"tags": tags,
"project_id": project_id,
"metadata": metadata,
"metrics": metrics,
}
payload = {k: v for k, v in payload.items() if v is not None or k in ["tags"]}
payload = {
k: v
for k, v in payload.items()
if v is not None or (clear_project and k == "project_id")
}

response = self.client.request("PUT", f"/evaluations/{evaluation_id}", json=payload)
return response
Expand Down Expand Up @@ -424,9 +433,9 @@ async def _lookup_environment_by_slug(self, owner_slug: str, name: str) -> str:
EvalsAPIError: If the environment does not exist (404)
"""
try:
response = await self.client.get(f"/environmentshub/{owner_slug}/{name}/@latest")
details = response.get("data", response)
return details["id"]
lookup_data: Dict[str, Any] = {"name": name, "owner_slug": owner_slug}
response = await self.client.post("/environmentshub/lookup", json=lookup_data)
return response["data"]["id"]
except APIError as e:
raise EvalsAPIError(
f"Environment '{owner_slug}/{name}' does not exist in the hub. "
Expand Down Expand Up @@ -515,6 +524,7 @@ async def create_evaluation(
task_type: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[List[str]] = None,
project_id: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
metrics: Optional[Dict[str, Any]] = None,
is_public: Optional[bool] = None,
Expand Down Expand Up @@ -558,6 +568,7 @@ async def create_evaluation(
"task_type": task_type,
"description": description,
"tags": tags or [],
"project_id": project_id,
"metadata": metadata,
"metrics": metrics,
}
Expand Down Expand Up @@ -715,6 +726,8 @@ async def update_evaluation(
task_type: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[List[str]] = None,
project_id: Optional[str] = None,
clear_project: bool = False,
metadata: Optional[Dict[str, Any]] = None,
metrics: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
Expand All @@ -725,11 +738,16 @@ async def update_evaluation(
"framework": framework,
"task_type": task_type,
"description": description,
"tags": tags if tags is not None else [],
"tags": tags,
"project_id": project_id,
"metadata": metadata,
"metrics": metrics,
}
payload = {k: v for k, v in payload.items() if v is not None or k in ["tags"]}
payload = {
k: v
for k, v in payload.items()
if v is not None or (clear_project and k == "project_id")
}

response = await self.client.request("PUT", f"/evaluations/{evaluation_id}", json=payload)
return response
Expand Down
2 changes: 2 additions & 0 deletions packages/prime-evals/src/prime_evals/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class Evaluation(BaseModel):
run_id: Optional[str] = Field(None, alias="runId")
version_id: Optional[str] = Field(None, alias="versionId")
tags: List[str] = Field(default_factory=list)
project_id: Optional[str] = Field(None, alias="projectId")
metadata: Optional[Dict[str, Any]] = None
metrics: Optional[Dict[str, Any]] = None
total_samples: Optional[int] = Field(None, alias="totalSamples")
Expand Down Expand Up @@ -66,6 +67,7 @@ class CreateEvaluationRequest(BaseModel):
task_type: Optional[str] = None
description: Optional[str] = None
tags: List[str] = Field(default_factory=list)
project_id: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None
metrics: Optional[Dict[str, Any]] = None

Expand Down
134 changes: 95 additions & 39 deletions packages/prime-evals/tests/test_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,45 +125,6 @@ def test_sample_model_with_metadata():
assert sample.info == {"batch": 1}


def test_lookup_environment_by_slug_uses_owner_aware_detail_endpoint():
    """Slug lookup issues exactly one GET to the owner/name detail endpoint."""
    recorded_calls = []

    class RecordingAPIClient:
        """Stub transport: records GETs and rejects any POST outright."""

        def get(self, endpoint):
            recorded_calls.append(("get", endpoint))
            return {"data": {"id": "env-123"}}

        def post(self, *_args, **_kwargs):
            raise AssertionError("owner/name lookup must not use team_slug lookup")

    evals_client = EvalsClient(RecordingAPIClient())

    resolved_id = evals_client._lookup_environment_by_slug("d42me", "opencode-cp")

    expected_calls = [("get", "/environmentshub/d42me/opencode-cp/@latest")]
    assert resolved_id == "env-123"
    assert recorded_calls == expected_calls


def test_async_lookup_environment_by_slug_uses_owner_aware_detail_endpoint():
    """Async slug lookup issues exactly one GET to the owner/name detail endpoint."""
    recorded_calls = []

    class RecordingAsyncAPIClient:
        """Async stub transport: records GETs and rejects any POST outright."""

        async def get(self, endpoint):
            recorded_calls.append(("get", endpoint))
            return {"data": {"id": "env-123"}}

        async def post(self, *_args, **_kwargs):
            raise AssertionError("owner/name lookup must not use team_slug lookup")

    # Allocate without running __init__, then attach the stub transport directly.
    evals_client = AsyncEvalsClient.__new__(AsyncEvalsClient)
    evals_client.client = RecordingAsyncAPIClient()

    lookup = evals_client._lookup_environment_by_slug("d42me", "opencode-cp")
    resolved_id = asyncio.run(lookup)

    expected_calls = [("get", "/environmentshub/d42me/opencode-cp/@latest")]
    assert resolved_id == "env-123"
    assert recorded_calls == expected_calls


def test_push_samples_reports_progress_and_reuses_http_client(monkeypatch):
posts = []
created_clients = []
Expand Down Expand Up @@ -271,6 +232,101 @@ def test_evals_client_context_manager():
pass # Expected to fail without proper initialization


def test_lookup_environment_by_slug_uses_owner_slug_payload():
    """Slug lookup POSTs ``name`` and ``owner_slug`` to the hub lookup endpoint."""
    seen = {}

    class RecordingHTTPClient:
        """Stub transport that remembers the endpoint and JSON body of a POST."""

        def post(self, endpoint, json=None):
            seen.update(endpoint=endpoint, json=json)
            return {"data": {"id": "env-123"}}

    # Allocate without running __init__, then attach the stub transport directly.
    evals_client = EvalsClient.__new__(EvalsClient)
    evals_client.client = RecordingHTTPClient()

    resolved_id = evals_client._lookup_environment_by_slug("alice", "gsm8k")

    expected_request = {
        "endpoint": "/environmentshub/lookup",
        "json": {"name": "gsm8k", "owner_slug": "alice"},
    }
    assert resolved_id == "env-123"
    assert seen == expected_request


def test_async_lookup_environment_by_slug_uses_owner_slug_payload():
    """Async slug lookup POSTs ``name`` and ``owner_slug`` to the lookup endpoint."""
    seen = {}

    class RecordingHTTPClient:
        """Async stub transport that remembers the endpoint and JSON body."""

        async def post(self, endpoint, json=None):
            seen.update(endpoint=endpoint, json=json)
            return {"data": {"id": "env-123"}}

    # Allocate without running __init__, then attach the stub transport directly.
    evals_client = AsyncEvalsClient.__new__(AsyncEvalsClient)
    evals_client.client = RecordingHTTPClient()

    lookup = evals_client._lookup_environment_by_slug("alice", "gsm8k")
    resolved_id = asyncio.run(lookup)

    expected_request = {
        "endpoint": "/environmentshub/lookup",
        "json": {"name": "gsm8k", "owner_slug": "alice"},
    }
    assert resolved_id == "env-123"
    assert seen == expected_request


def test_create_evaluation_sends_project_id_payload():
    """``create_evaluation`` forwards ``project_id`` in snake_case in the POST body."""
    seen = {}

    class StubConfig:
        team_id = None

    class RecordingHTTPClient:
        """Stub transport that captures the arguments of ``request``."""

        config = StubConfig()

        def request(self, method, endpoint, json=None, params=None):
            seen.update(method=method, endpoint=endpoint, json=json, params=params)
            return {"evaluation_id": "eval-123"}

    # Allocate without running __init__, then attach the stub transport directly.
    evals_client = EvalsClient.__new__(EvalsClient)
    evals_client.client = RecordingHTTPClient()

    create_kwargs = {
        "name": "gsm8k",
        "run_id": "run-123",
        "model_name": "gpt-4o-mini",
        "project_id": "project-123",
    }
    result = evals_client.create_evaluation(**create_kwargs)

    assert result == {"evaluation_id": "eval-123"}
    assert seen["method"] == "POST"
    assert seen["endpoint"] == "/evaluations/"
    body = seen["json"]
    assert body["project_id"] == "project-123"
    # The camelCase spelling must not appear in the request body.
    assert "projectId" not in body


def test_update_evaluation_clear_project_sends_null_project_id():
    """``clear_project=True`` alone yields a PUT body of ``{"project_id": None}``."""
    seen = {}

    class RecordingHTTPClient:
        """Stub transport that captures the arguments of ``request``."""

        def request(self, method, endpoint, json=None, params=None):
            seen.update(method=method, endpoint=endpoint, json=json, params=params)
            return {"evaluation_id": "eval-123"}

    # Allocate without running __init__, then attach the stub transport directly.
    evals_client = EvalsClient.__new__(EvalsClient)
    evals_client.client = RecordingHTTPClient()

    result = evals_client.update_evaluation("eval-123", clear_project=True)

    assert result == {"evaluation_id": "eval-123"}
    assert seen["method"] == "PUT"
    assert seen["endpoint"] == "/evaluations/eval-123"
    # Only the explicit null project_id is sent; no other field is included.
    assert seen["json"] == {"project_id": None}


def test_evaluation_model_minimal():
"""Test Evaluation model with minimal data"""
data = {
Expand Down
Loading
Loading