Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 37 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
contents: read

jobs:
test:
unit-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
Expand All @@ -20,5 +20,40 @@
pixi-version: latest
environments: dev

- name: Run tests
- name: Run unit tests
run: pixi run -e dev test

integration-tests:
runs-on: ubuntu-latest
services:
minio:
image: bitnamilegacy/minio:latest

Check failure

Code scanning / zizmor

unpinned image references Error test

unpinned image references
Comment thread
github-advanced-security[bot] marked this conversation as resolved.
Fixed
ports:
- 9000:9000
env:
MINIO_ROOT_USER: minioadmin
MINIO_ROOT_PASSWORD: minioadmin
MINIO_DEFAULT_BUCKETS: josh-test-bucket:public
MINIO_SCHEME: http
options: >-
--health-cmd "curl -f http://localhost:9000/minio/health/ready || curl -f http://localhost:9000/minio/health/live"
--health-interval 10s
--health-timeout 5s
--health-retries 5

steps:
# Do not persist the job token in .git/config: the remaining steps only read
# the checked-out tree (pixi tasks, JAR download, tests) and never push.
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
  with:
    persist-credentials: false

Check notice

Code scanning / zizmor

credential persistence through GitHub Actions artifacts Note test

credential persistence through GitHub Actions artifacts
Comment thread
github-advanced-security[bot] marked this conversation as resolved.
Fixed

- uses: prefix-dev/setup-pixi@a0af7a228712d6121d37aba47adf55c1332c9c2e # v0.9.4
with:
pixi-version: latest
environments: dev

- name: Verify MinIO is ready
run: curl -f http://localhost:9000/minio/health/ready

- name: Download Josh JAR
run: pixi run get-jars

- name: Run integration tests
run: pixi run -e dev test-integration
344 changes: 344 additions & 0 deletions BATCH_INTEGRATION.md

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions joshpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
InspectExportsConfig,
ExportFileInfo,
ExportPaths,
StageFromMinioConfig,
)

# JFR diagnostics (always available, no external deps)
Expand Down Expand Up @@ -119,6 +120,7 @@
RunInfo,
SessionSummary,
DataSummary,
configure_s3,
)
from joshpy.cell_data import (
CellDataLoader,
Expand All @@ -136,6 +138,7 @@
SweepManagerBuilder,
recover_sweep_results,
load_job_results,
ingest_results,
LoadConfig,
ResultLoadError,
)
Expand Down Expand Up @@ -182,6 +185,7 @@
"InspectExportsConfig",
"ExportFileInfo",
"ExportPaths",
"StageFromMinioConfig",
# JFR diagnostics
"ResourceProfile",
"CpuProfile",
Expand Down Expand Up @@ -243,6 +247,7 @@
"RunInfo",
"SessionSummary",
"DataSummary",
"configure_s3",
"CellDataLoader",
"DiagnosticQueries",
"SimulationDiagnostics",
Expand All @@ -252,6 +257,7 @@
"SweepManagerBuilder",
"recover_sweep_results",
"load_job_results",
"ingest_results",
"LoadConfig",
"ResultLoadError",
"HAS_SWEEP",
Expand Down
20 changes: 13 additions & 7 deletions joshpy/cell_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def __init__(self, registry: Any):

def load_csv(
self,
csv_path: Path,
csv_path: "Path | str",
run_id: str,
run_hash: str,
entity_type: str = "patch",
Expand All @@ -166,10 +166,12 @@ def load_csv(
quoted identifiers (e.g., 'avg.height' stays as "avg.height"), requiring
double quotes when referenced with direct calls to DuckDB.

Uses DuckDB's native CSV reader for optimal performance.
Uses DuckDB's native CSV reader for optimal performance. Accepts both
local ``Path`` objects and ``s3://`` URL strings (requires httpfs to be
loaded on the connection -- see ``configure_s3()``).

Args:
csv_path: Path to the CSV file.
csv_path: Path to the CSV file, or an ``s3://`` URL string.
run_id: The run ID this data belongs to.
run_hash: Run hash for this run.
entity_type: Type of entity being exported (default: "patch").
Expand All @@ -178,14 +180,18 @@ def load_csv(
Number of rows loaded.

Raises:
FileNotFoundError: If csv_path doesn't exist.
FileNotFoundError: If csv_path is a local path that doesn't exist.
ValueError: If CSV is missing required columns or type mismatch.
"""
if not csv_path.exists():
raise FileNotFoundError(f"CSV not found: {csv_path}")
if isinstance(csv_path, str) and csv_path.startswith("s3://"):
csv_path_str = csv_path
else:
csv_path = Path(csv_path)
if not csv_path.exists():
raise FileNotFoundError(f"CSV not found: {csv_path}")
csv_path_str = str(csv_path.resolve())

conn = self.registry.conn
csv_path_str = str(csv_path.resolve())

# Read CSV header to identify columns using DuckDB
header_result = conn.execute(f"SELECT * FROM read_csv_auto('{csv_path_str}') LIMIT 0")
Expand Down
57 changes: 57 additions & 0 deletions joshpy/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,32 @@ class InspectJshdConfig:
y: int


@dataclass(frozen=True)
class StageFromMinioConfig:
"""Arguments for 'java -jar joshsim.jar stageFromMinio' command.

Downloads all objects under a MinIO prefix to a local directory.
MinIO credentials are optional -- joshsim falls back to environment
variables (MINIO_ENDPOINT, MINIO_ACCESS_KEY, MINIO_SECRET_KEY,
MINIO_BUCKET) via its HierarchyConfig.

Attributes:
output_dir: Local directory to download files into.
prefix: MinIO object prefix to download from.
minio_endpoint: MinIO endpoint URL (optional).
minio_access_key: MinIO access key (optional).
minio_secret_key: MinIO secret key (optional).
minio_bucket: MinIO bucket name (optional).
"""

output_dir: Path
prefix: str
minio_endpoint: str | None = None
minio_access_key: str | None = None
minio_secret_key: str | None = None
minio_bucket: str | None = None


@dataclass(frozen=True)
class InspectExportsConfig:
"""Arguments for 'java -jar joshsim.jar inspect-exports' command.
Expand Down Expand Up @@ -624,6 +650,37 @@ def _execute(
command=cmd,
)

def stage_from_minio(
    self,
    config: StageFromMinioConfig,
    timeout: float | None = None,
) -> CLIResult:
    """Run the ``stageFromMinio`` command to fetch a MinIO prefix locally.

    The output directory is resolved to an absolute path before it is
    passed on. Each MinIO connection option is added to the command line
    only when the corresponding config value is set (truthy).

    Args:
        config: Stage-from-MinIO configuration.
        timeout: Timeout in seconds.

    Returns:
        CLIResult with execution details.
    """
    destination = str(config.output_dir.resolve())
    argv = ["stageFromMinio", "--output-dir", destination, "--prefix", config.prefix]

    # Flag/value pairs in the order they must appear on the command line.
    optional_flags = (
        ("--minio-endpoint", config.minio_endpoint),
        ("--minio-access-key", config.minio_access_key),
        ("--minio-secret-key", config.minio_secret_key),
        ("--minio-bucket", config.minio_bucket),
    )
    for flag, value in optional_flags:
        if value:
            argv += [flag, value]

    return self._execute(argv, timeout=timeout)

def _execute_streaming(
self,
cmd: list[str],
Expand Down
42 changes: 42 additions & 0 deletions joshpy/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,48 @@ def _check_duckdb() -> None:
)


def configure_s3(
    conn: Any,
    endpoint: str,
    access_key: str,
    secret_key: str,
    url_style: str = "path",
    use_ssl: bool = True,
) -> None:
    """Configure a DuckDB connection for S3/MinIO access via httpfs.

    Installs and loads the httpfs extension, then creates an S3 secret
    so ``read_csv_auto('s3://bucket/key.csv')`` works transparently.

    Credential resolution is the caller's responsibility -- this function
    takes explicit values. ``ingest_results()`` resolves credentials from
    environment variables (``MINIO_ENDPOINT``, ``MINIO_ACCESS_KEY``,
    ``MINIO_SECRET_KEY``) before calling here.

    Args:
        conn: DuckDB connection object.
        endpoint: S3-compatible endpoint, either a bare host such as
            ``"storage.googleapis.com"`` / ``"localhost:9000"`` or a URL
            such as ``"http://localhost:9000"``. A leading ``http://`` or
            ``https://`` and any trailing ``/`` are stripped before the
            value is handed to DuckDB.
        access_key: Access key / key ID.
        secret_key: Secret key.
        url_style: ``"path"`` (default, MinIO) or ``"vhost"`` (AWS).
        use_ssl: Use HTTPS (default True). When ``endpoint`` carries an
            explicit ``http://`` or ``https://`` scheme, that scheme
            decides instead, since it is the more specific statement of
            how the server is reached.
    """
    # DuckDB prepends the scheme itself (chosen by USE_SSL), so ENDPOINT must
    # be a bare host[:port]; passing a full URL would yield a malformed address.
    scheme, separator, host = endpoint.partition("://")
    if separator and scheme.lower() in ("http", "https"):
        endpoint = host
        use_ssl = scheme.lower() == "https"
    endpoint = endpoint.rstrip("/")

    conn.execute("INSTALL httpfs; LOAD httpfs;")
    conn.execute(
        """
        CREATE OR REPLACE SECRET (
            TYPE s3,
            KEY_ID ?,
            SECRET ?,
            ENDPOINT ?,
            URL_STYLE ?,
            USE_SSL ?
        )
        """,
        [access_key, secret_key, endpoint, url_style, use_ssl],
    )


def _get_git_hash() -> str | None:
"""Get current git HEAD hash, or None if not in a git repo."""
try:
Expand Down
Loading