diff --git a/CHANGELOG.md b/CHANGELOG.md index d20160e..46b2887 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,22 @@ All notable changes to AlphaPulse are documented here. --- +## [Unreleased] — WandB XAI & Plot Quality Overhaul + +- **Universal feature importance:** `compute_universal_feature_importance` extracts and normalizes importance from any supported model type (XGBoost pred_contribs, LightGBM gain, CatBoost PredictionValuesChange, sklearn `feature_importances_`), averages across all models present, and logs a ranked bar chart to WandB. +- **Era-stratified importance:** `_log_era_stratified_importance` slices validation data by era, computes importance per slice, and logs a `line_series` chart showing each feature's importance trajectory over eras — directly reveals temporal stability. +- **Per-era stability report wired:** `compute_feature_report` (LightGBM proxy) now surfaces in WandB via `_log_feature_report`; logs top features by mean importance, top by era stability, and worst by era stability — each with bar charts. +- **Best-trial diagnostics run:** After HPO, the best config is retrained on an 80/20 era split and all expensive diagnostics (`log_era_importance=True`, top-50 importance artifact) are logged to a dedicated `best-trial-diagnostics` WandB run. +- **Prediction histogram fixed:** `_log_prediction_diagnostics` now uses `np.histogram(bins=50)` (50 rows) instead of logging every prediction row (50k+ rows). +- **Per-era line charts fixed:** `era_index` (0, 1, 2…) used as x-axis — fixes alphabetical string sort that scrambled chronological order. +- **Drawdown curve added:** Per-era drawdown from peak cumulative correlation logged alongside the cumulative correlation line chart. +- **Correlation distribution histogram:** Distribution of per-era Spearman correlations logged as a bar chart — directly answers "how many negative eras does this model have?" +- **Missing bar charts added:** Feature exposure top-15, ensemble model-pair correlation (A→B format), worst stability by era — all now have companion bar charts. +- **HPO summary table expanded:** 18 → 30 columns; adds `model_1/2/3_type` (split, for WandB parallel coordinates), XGBoost/LightGBM hyperparams, feature selection, noise injection, augmentation flags. +- **Convergence chart:** `log_hpo_convergence` logs all trial scores and running-best `corr_sharpe` in a single WandB run after the HPO search completes, rendering as a proper convergence curve. +- **String metric bug fixed:** `feature_importance_model_type` moved from `wandb.log()` (coerced to NaN) to `wandb.run.summary`. +- **Duplicate metric removed:** `metric/corr_sharpe` deduplicated in `log_hpo_trial_metrics`. + ## [0.5.0] — Production Hardening - **HPO fault tolerance:** Each local trial runs in an isolated subprocess; a crash marks that trial failed and the sweep continues. A SQLite-backed `TrialDB` (`src/alphapulse/hpo/trial_db.py`) persists trial state across crashes. `--resume` flag skips already-completed trial numbers. `--trial-timeout` caps each subprocess. diff --git a/README.md b/README.md index ba5d946..7917503 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,22 @@ AlphaPulse is a config-driven framework for building, training, and deploying ML pipelines for the [Numerai](https://numer.ai) stock-market prediction tournament. It covers the full workflow: dataset download, experiment definition, backtesting, hyperparameter optimization (HPO), and automated weekly submission. +## Architecture + +![AlphaPulse Architecture](docs/assets/architecture.drawio.png) + +The framework is organized into five layers: + +| Layer | Components | Purpose | +|---|---|---| +| **Data** | `NumeraiDataLoader`, parquet files, `features.json` | Downloads and loads Numerai dataset splits (train/validation/live) | +| **Configuration** | `ExperimentV1` YAML schema, HPO search space, `TrialDB`, AutoResearch agent | Defines what to train — via static YAML, automated HPO, or Claude-agent-driven research | +| **Core Pipeline** | Preprocessors, Models, `Pipeline` / `MultiHeadPipeline`, Ensemble, `FeatureNeutralizer` | Fits and combines models; handles feature routing, ensembling, and prediction neutralization | +| **Evaluation** | `Backtester`, `PurgedEraCV`, SHAP report, W&B diagnostics | Computes era-aware metrics (CORR, Sharpe, MMC) and XAI reports | +| **Export & Submission** | `predict.pkl`, live inference, submission validation, Numerai upload | Produces tournament-ready predictions and submits them | + +> The diagram is editable — open `docs/assets/architecture.drawio` in [draw.io](https://app.diagrams.net) to modify it. + ----- ## Table of Contents @@ -347,7 +363,7 @@ evaluation: ### Advanced Features * **Feature Groups:** Define `features.groups` as a mapping of `group_name -> [columns]`. You can then assign specific models to specific groups using `models[].input_group: group_name`. - * **Available Preprocessors:** `StandardScaler`, `RobustScaler`, `PCA`, `TruncatedSVD`, `GaussianNoise`, `VarianceSelector`, `LGBMImportanceSelector`, `Packboost`, and `GroupedPreprocessor`. + * **Available Preprocessors:** `StandardScaler`, `RobustScaler`, `PCA`, `TruncatedSVD`, `AutoencoderPreprocessor`, `CompressionPreprocessor`, `GaussianNoise`, `VarianceSelector`, `LGBMImportanceSelector`, `EraStableFeatureSelector`, `Packboost`, and `GroupedPreprocessor`. * **Available Models:** - **Gradient Boosting:** `XGBoost`, `LightGBM`, `CatBoost`, `Packboost` - **Tree Ensembles:** `RandomForest`, `ExtraTrees` @@ -482,31 +498,35 @@ make eda-lint ├── data/ # Downloaded Numerai parquet files ├── experiments/ # YAML configuration files ├── scripts/ # Executable workflow scripts -│ ├── download_dataset.py -│ ├── run_experiment.py -│ ├── hpo_pipeline.py -│ ├── run_test_pipeline.py -│ ├── export_numerai_pickle.py -│ ├── export_from_yaml.py -│ ├── live_inference.py -│ ├── submit_predictions.py -│ ├── make_feature_groups.py -│ └── autoresearch.py +│ ├── download_dataset.py # Download Numerai dataset +│ ├── run_experiment.py # Run a YAML-defined experiment (+ W&B logging) +│ ├── hpo_pipeline.py # Automated hyperparameter search (Ray Tune) +│ ├── run_test_pipeline.py # Lightweight smoke test +│ ├── export_numerai_pickle.py # Export predict.pkl from HPO result +│ ├── export_from_yaml.py # Export predict.pkl from YAML experiment +│ ├── live_inference.py # Run trained model on live tournament data +│ ├── submit_predictions.py # Upload predictions to Numerai +│ ├── make_feature_groups.py # Generate feature group definitions +│ ├── gpu_smoke_test.py # Verify GPU availability for deep models +│ ├── autoresearch.py # Claude-agent-driven research loop +│ └── wandb_sweep_config.yaml # W&B sweep configuration ├── eda/ # Standalone Streamlit EDA dashboard │ ├── app.py # Main entry point (streamlit run eda/app.py) -│ ├── pages/ # Multi-page analysis modules -│ └── utils/ # Config & data loading (uses NumeraiDataLoader) +│ ├── pages/ # Multi-page analysis modules (8 pages) +│ └── utils/ # Config, data loading, translations (EN/PL) +├── docs/assets/ # Diagrams and documentation assets +│ └── architecture.drawio.png # Architecture diagram (editable in draw.io) ├── src/alphapulse/ # Core framework source code -│ ├── autoresearch/ # Agent-driven research loop -│ ├── evaluation/ # Backtesting, metrics, diagnostics, export validation -│ ├── experiments/ # YAML schema, runner, data loading -│ ├── hpo/ # HPO objective, search space, builder, registry +│ ├── autoresearch/ # Agent-driven research loop (loop, agent, mutations, state) +│ ├── evaluation/ # Backtesting, metrics, SHAP report, W&B diagnostics, submission validation +│ ├── experiments/ # YAML schema (ExperimentV1), runner +│ ├── hpo/ # HPO objective, search space, builder, registry, TrialDB (SQLite) │ ├── logging_/ # Leaderboard and W&B helpers │ ├── models/ # All model implementations + factory -│ ├── pipeline/ # Pipeline, ensemble, neutralizer, stacker -│ ├── preprocessors/ # All preprocessor implementations + factory -│ ├── utils/ # Seed utility (set_global_seed) -│ └── validation/ # Purged era cross-validation +│ ├── pipeline/ # Pipeline, MultiHeadPipeline, MultiTargetPipeline, ensemble, neutralizer, stacker +│ ├── preprocessors/ # All preprocessor implementations + factory (incl. autoencoder, compression, era-stable selector) +│ ├── utils/ # Global seed utility +│ └── validation/ # PurgedEraCV └── tests/ # Unit tests ``` @@ -529,13 +549,17 @@ Commit messages: prefer conventional commits (e.g. `feat: ...`, `fix: ...`, `doc See [CHANGELOG.md](CHANGELOG.md) for completed releases. -**Completed — v0.5.0 (Production Hardening):** +**Completed — v0.5.0 (Production Hardening + XAI):** - **HPO fault tolerance:** Each local trial runs in an isolated subprocess; crashes mark the trial failed and the sweep continues. A SQLite-backed `TrialDB` persists trial state. `--resume` skips already-completed trials. - **Provenance artifact:** On every export, a hermetically sealed bundle is written: resolved config, `uv export` dependency snapshot, and git commit hash. - **Canonical artifact naming:** Exported models follow `___.pkl` with a `latest_predict.pkl` symlink. - **Masked loss for auxiliary targets:** `MultiTargetPipeline` drops NaN rows per-target; targets with fewer than 10 valid rows are skipped entirely. -- **Feature neutralization in eval loop:** `Backtester` and `EraSplitEvaluator` accept an optional `FeatureNeutralizer`; predictions are neutralized before metric computation. +- **Feature neutralization in eval loop:** `Backtester` accepts an optional `FeatureNeutralizer`; predictions are neutralized before metric computation. - **W&B experiment runner integration:** `scripts/run_experiment.py` logs configs, per-era metrics, and artifact paths to W&B via `--wandb-project`. +- **XAI / SHAP reporting:** `shap_report.py` computes per-era feature importance; `wandb_diagnostics.py` pushes rich HPO and XAI plots to W&B. +- **GPU HPO + foundation models:** `TabPFN3`, `TabICL`, and `TabularDL` (ft_transformer / mlp) with GPU-accelerated HPO via Ray Tune. +- **Universal feature importance + era stability:** `EraStableFeatureSelector` ranks features by blended mean importance / cross-era stability; `feature_report.py` surfaces per-era diagnostics. +- **Autoencoder + compression preprocessors:** `AutoencoderPreprocessor` and `CompressionPreprocessor` for learned low-dimensional representations. ----- diff --git a/docs/assets/architecture.drawio b/docs/assets/architecture.drawio new file mode 100644 index 0000000..6cd37e8 --- /dev/null +++ b/docs/assets/architecture.drawio @@ -0,0 +1,373 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/assets/architecture.drawio.png b/docs/assets/architecture.drawio.png new file mode 100644 index 0000000..aed99da Binary files /dev/null and b/docs/assets/architecture.drawio.png differ diff --git a/scripts/hpo_pipeline.py b/scripts/hpo_pipeline.py index c1b7350..31e256b 100644 --- a/scripts/hpo_pipeline.py +++ b/scripts/hpo_pipeline.py @@ -184,6 +184,126 @@ def _best_from_db(db: TrialDB, objective: str) -> tuple[float, dict]: return best_score, best_config +def _run_best_trial_diagnostics( + *, + best_config: dict, + data_dir: Path, + train_subsample: float, + target_col: str, + seed: int, + wandb_project: str, + wandb_group: str, + feature_cols: list[str], +) -> None: + """Retrain the best config and log a comprehensive XAI diagnostic WandB run. + + Splits the training data into an 80/20 era train/holdout split, retrains + the best pipeline on the train portion, and logs universal feature importance, + the per-era stability report, and era-stratified importance from the actual + trained models. Results are logged as a dedicated 'best-trial-diagnostics' run + within the same WandB group as the HPO trials. + + Args: + best_config: Flat config dict of the best HPO trial. + data_dir: Path to the data directory. + train_subsample: Fraction of training data used (same as HPO). + target_col: Target column name. + seed: Random seed. + wandb_project: WandB project name. + wandb_group: WandB group name (same as HPO run group). + feature_cols: Feature column names. + """ + import wandb + + from alphapulse.evaluation.backtester import Backtester + from alphapulse.evaluation.shap_report import compute_universal_feature_importance + from alphapulse.evaluation.wandb_diagnostics import log_experiment_diagnostics + from alphapulse.hpo.objective import _fit_pipeline + from alphapulse.hpo.search_space import ( + get_train_kwargs_from_flat, + resolve_flat_config, + ) + from alphapulse.logging_.wandb_utils import log_importance_artifact + + try: + X_train, y_train, _ = load_train_only_frame( + data_dir, + train_subsample=train_subsample, + target_col=target_col, + seed=seed, + feature_columns=None, + need_era=True, + ) + era_train = X_train["era"] + + eras_sorted = sorted(era_train.unique(), key=str) + n_holdout = max(5, len(eras_sorted) // 5) + holdout_set = set(eras_sorted[-n_holdout:]) + train_mask = ~era_train.isin(holdout_set) + + pipeline_cfg = resolve_flat_config(best_config) + if best_config.get("use_gpu"): + from alphapulse.hpo.search_space import apply_gpu_pipeline_config + + pipeline_cfg = apply_gpu_pipeline_config(pipeline_cfg) + train_kwargs = get_train_kwargs_from_flat(best_config) + + pipeline = _fit_pipeline( + pipeline_cfg, + feature_cols, + X_train.loc[train_mask], + y_train.loc[train_mask], + train_kwargs, + flat_config=best_config, + seed=seed, + ) + + ho_mask = era_train.isin(holdout_set) + X_ho = X_train.loc[ho_mask] + y_ho = y_train.loc[ho_mask] + era_ho = era_train.loc[ho_mask] + + X_feat = X_ho[feature_cols] + metrics = Backtester(pipeline, feature_columns=feature_cols).evaluate( + X_ho, y_ho, era_ho + ) + + wandb.init( + project=wandb_project, + group=wandb_group, + name="best-trial-diagnostics", + job_type="diagnostics", + config=best_config, + reinit=True, + ) + + log_experiment_diagnostics( + pipeline=pipeline, + X_val=X_ho, + y_val=y_ho, + era_val=era_ho, + feature_cols=feature_cols, + metrics=metrics, + log_shap=True, + log_feature_report=True, + log_era_importance=True, + ) + + importance, _ = compute_universal_feature_importance( + pipeline, X_feat, feature_cols=feature_cols, top_n=50 + ) + if importance: + log_importance_artifact(importance, name="best-trial-feature-importance") + + wandb.finish(quiet=True) + except Exception as exc: + logger.warning("Best-trial diagnostics failed: {}", exc) + try: + wandb.finish(quiet=True) + except Exception as finish_exc: + logger.debug("wandb.finish cleanup error: {}", finish_exc) + + def _run_local( *, data_dir: Path, @@ -442,11 +562,28 @@ def _run_local( logger.info("Leaderboard saved to: {}", output_dir / "leaderboard.json") if wandb_project and wandb_group: - from alphapulse.logging_.wandb_utils import log_hpo_summary_table + from alphapulse.logging_.wandb_utils import ( + log_hpo_convergence, + log_hpo_summary_table, + ) log_hpo_summary_table(results, project=wandb_project, group=wandb_group) + log_hpo_convergence(results, project=wandb_project, group=wandb_group) logger.info("WandB summary table logged to project={}", wandb_project) + if wandb_project and wandb_group and best_config: + logger.info("Running best-trial XAI diagnostics in WandB...") + _run_best_trial_diagnostics( + best_config=best_config, + data_dir=data_dir, + train_subsample=train_subsample, + target_col=target_col, + seed=seed, + wandb_project=wandb_project, + wandb_group=wandb_group, + feature_cols=feature_cols, + ) + def _run_ray( *, diff --git a/scripts/run_experiment.py b/scripts/run_experiment.py index 7f5f443..caf28e9 100644 --- a/scripts/run_experiment.py +++ b/scripts/run_experiment.py @@ -9,6 +9,74 @@ from alphapulse.utils import set_global_seed +def _build_wandb_config( + exp: Any, *, config_path: str, seed: int, gpu: bool +) -> dict[str, Any]: + models = exp.models + preprocessors = exp.preprocessing + + model_types = "+".join(m.type for m in models) + preprocessor_types = ( + "+".join(p.type for p in preprocessors) if preprocessors else "none" + ) + + cfg: dict[str, Any] = { + "config_path": config_path, + "seed": seed, + "gpu": gpu, + "target_col": exp.data.target_col, + "train_subsample": exp.data.train_subsample, + "feature_columns": "all" + if exp.features.columns is None + else len(exp.features.columns), + "n_feature_groups": len(exp.features.groups), + "n_models": len(models), + "model_types": model_types, + "n_preprocessors": len(preprocessors), + "preprocessor_types": preprocessor_types, + "ensemble_method": exp.ensemble_method, + "n_rounds": exp.train.n_rounds, + "early_stopping_rounds": exp.train.early_stopping_rounds, + "neutralization_proportion": exp.neutralization.proportion, + "primary_metric": exp.evaluation.primary_metric, + } + + cfg["is_multihead"] = any( + m.input_group is not None or m.input_columns is not None for m in models + ) + + for i, m in enumerate(models, start=1): + cfg[f"model_{i}_type"] = m.type + cfg[f"model_{i}_input_group"] = ( + m.input_group if m.input_group is not None else "all" + ) + if m.input_columns is not None: + cfg[f"model_{i}_input_columns_count"] = len(m.input_columns) + for k, v in m.params.items(): + if not isinstance(v, dict | list): + cfg[f"model_{i}_{k}"] = v + inner = m.params.get("params", {}) + for k, v in inner.items(): + if not isinstance(v, dict | list): + cfg[f"model_{i}_{k}"] = v + for j, lp in enumerate(m.preprocessors, start=1): + cfg[f"model_{i}_local_preprocessor_{j}_type"] = lp.type + for k, v in lp.params.items(): + if not isinstance(v, dict | list): + cfg[f"model_{i}_local_preprocessor_{j}_{k}"] = v + + for i, p in enumerate(preprocessors, start=1): + cfg[f"preprocessor_{i}_type"] = p.type + for k, v in p.params.items(): + if not isinstance(v, dict | list): + cfg[f"preprocessor_{i}_{k}"] = v + + for group_name, cols in exp.features.groups.items(): + cfg[f"feature_group_{group_name}_n_features"] = len(cols) + + return cfg + + def main( config: Path = Path("experiments/example_v1.yaml"), artifact_dir: Path | None = Path("artifacts/experiments"), @@ -27,15 +95,18 @@ def main( """ set_global_seed(seed) from alphapulse.experiments import run_experiment_from_path + from alphapulse.experiments.runner import load_experiment_dict + from alphapulse.experiments.schema import ExperimentV1 if wandb_project: from alphapulse.logging_.wandb_utils import init_wandb_run - init_wandb_run( - project=wandb_project, - name=config.stem, - config={"config_path": str(config), "seed": seed, "gpu": gpu}, + exp_dict = load_experiment_dict(config) + exp_parsed = ExperimentV1.model_validate(exp_dict) + wandb_cfg = _build_wandb_config( + exp_parsed, config_path=str(config), seed=seed, gpu=gpu ) + init_wandb_run(project=wandb_project, name=config.stem, config=wandb_cfg) result = run_experiment_from_path( config, diff --git a/src/alphapulse/evaluation/shap_report.py b/src/alphapulse/evaluation/shap_report.py index 96422b9..04de279 100644 --- a/src/alphapulse/evaluation/shap_report.py +++ b/src/alphapulse/evaluation/shap_report.py @@ -12,6 +12,8 @@ SHAP_SAMPLE_ROWS = 2000 +_ModelList = list[tuple[str, object]] + def _wandb_active() -> bool: try: @@ -22,6 +24,51 @@ def _wandb_active() -> bool: return False +def _collect_by_type( + pipeline: Pipeline | MultiHeadPipeline, + model_class: type, +) -> _ModelList: + results: _ModelList = [] + sources = ( + [h.model for h in pipeline.heads] + if isinstance(pipeline, MultiHeadPipeline) + else pipeline.models + ) + for m in sources: + if isinstance(m, model_class): + results.append((m.name, m)) + elif isinstance(m, EraEnsembleModel): + for sub in m._sub_models: + if isinstance(sub, model_class): + results.append((sub.name, sub)) + return results + + +def _collect_xgboost_models(pipeline: Pipeline | MultiHeadPipeline) -> _ModelList: + return _collect_by_type(pipeline, XGBoostModel) + + +def _collect_lgbm_models(pipeline: Pipeline | MultiHeadPipeline) -> _ModelList: + from ..models.lightgbm_model import LightGBMModel + + return _collect_by_type(pipeline, LightGBMModel) + + +def _collect_catboost_models(pipeline: Pipeline | MultiHeadPipeline) -> _ModelList: + from ..models.catboost_model import CatBoostModel + + return _collect_by_type(pipeline, CatBoostModel) + + +def _collect_sklearn_tree_models(pipeline: Pipeline | MultiHeadPipeline) -> _ModelList: + from ..models.sklearn_models import ExtraTreesModel, RandomForestModel + + results: _ModelList = [] + for cls in (RandomForestModel, ExtraTreesModel): + results.extend(_collect_by_type(pipeline, cls)) + return results + + def _xgb_contribs(model: XGBoostModel, X: pd.DataFrame) -> tuple[np.ndarray, list[str]]: feat = _numeric(X) dmat = xgb.DMatrix(feat) @@ -30,84 +77,189 @@ def _xgb_contribs(model: XGBoostModel, X: pd.DataFrame) -> tuple[np.ndarray, lis return np.asarray(contribs, dtype=np.float64), cols -def _collect_xgboost_models( - pipeline: Pipeline | MultiHeadPipeline, -) -> list[tuple[str, XGBoostModel]]: - models: list[tuple[str, XGBoostModel]] = [] - if isinstance(pipeline, MultiHeadPipeline): - for head in pipeline.heads: - m = head.model - if isinstance(m, XGBoostModel): - models.append((m.name, m)) - elif isinstance(m, EraEnsembleModel): - for sub in m._sub_models: - if isinstance(sub, XGBoostModel): - models.append((sub.name, sub)) - return models - - for model in pipeline.models: - if isinstance(model, XGBoostModel): - models.append((model.name, model)) - elif isinstance(model, EraEnsembleModel): - for sub in model._sub_models: - if isinstance(sub, XGBoostModel): - models.append((sub.name, sub)) - return models - - -def compute_xgboost_feature_importance( +def _lgbm_importance(model: object, feature_cols: list[str]) -> dict[str, float]: + """Gain-based LightGBM importance, normalized to [0, 1].""" + booster = getattr(model, "model", None) + if booster is None: + return {} + raw = booster.feature_importance(importance_type="gain") + names = booster.feature_name() + total = float(raw.sum()) + if total <= 0: + return {} + result: dict[str, float] = {} + for name, val in zip(names, raw, strict=False): + if name in feature_cols: + result[name] = float(val) / total + return result + + +def _catboost_importance(model: object, feature_cols: list[str]) -> dict[str, float]: + """CatBoost PredictionValuesChange importance, normalized to [0, 1].""" + cb = getattr(model, "model", None) + if cb is None: + return {} + raw = cb.get_feature_importance() + names = cb.feature_names_ + total = float(np.sum(raw)) + if total <= 0: + return {} + result: dict[str, float] = {} + for name, val in zip(names, raw, strict=False): + if name in feature_cols: + result[name] = float(val) / total + return result + + +def _sklearn_tree_importance( + model: object, feature_cols: list[str] +) -> dict[str, float]: + """sklearn tree feature_importances_, normalized to [0, 1].""" + estimator = getattr(model, "model", None) + if estimator is None: + return {} + raw = getattr(estimator, "feature_importances_", None) + if raw is None: + return {} + total = float(np.sum(raw)) + if total <= 0: + return {} + return {col: float(v) / total for col, v in zip(feature_cols, raw, strict=False)} + + +def _aggregate_importance( + per_model: list[dict[str, float]], +) -> dict[str, float]: + """Average normalized importance dicts across models.""" + if not per_model: + return {} + combined: dict[str, list[float]] = {} + for imp in per_model: + for feat, val in imp.items(): + combined.setdefault(feat, []).append(val) + return {feat: float(np.mean(vals)) for feat, vals in combined.items()} + + +def compute_universal_feature_importance( pipeline: Pipeline | MultiHeadPipeline, X: pd.DataFrame, *, + feature_cols: list[str], top_n: int = 20, max_rows: int = SHAP_SAMPLE_ROWS, -) -> dict[str, float]: +) -> tuple[dict[str, float], str]: + """Extract feature importance from any supported model type in the pipeline. + + Tries XGBoost (pred_contribs), LightGBM (gain), CatBoost, and sklearn tree + models. Normalizes each model's scores to [0, 1] and averages across models + of all types present. + + Args: + pipeline: Trained pipeline. + X: Feature DataFrame (pre-preprocessed, numeric). + feature_cols: Feature column names to report. + top_n: Maximum features to return. + max_rows: Row cap for XGBoost pred_contribs (expensive). + + Returns: + Tuple of (importance_dict sorted descending, model_type_label). + importance_dict is empty if no supported models are found. + """ + sample = X if len(X) <= max_rows else X.sample(n=max_rows, random_state=0) + + per_model_imps: list[dict[str, float]] = [] + type_labels: list[str] = [] + xgb_models = _collect_xgboost_models(pipeline) - if not xgb_models: - return {} + for _, m in xgb_models: + try: + contribs, cols = _xgb_contribs(m, sample) # type: ignore[arg-type] + mean_abs = np.mean(np.abs(contribs[:, :-1]), axis=0) + total = float(mean_abs.sum()) + if total > 0: + imp = { + c: float(v) / total + for c, v in zip(cols, mean_abs, strict=False) + if c in feature_cols + } + per_model_imps.append(imp) + type_labels.append("XGBoost") + except Exception: # noqa: BLE001, S112 + continue + + lgbm_models = _collect_lgbm_models(pipeline) + for _, m in lgbm_models: + imp = _lgbm_importance(m, feature_cols) + if imp: + per_model_imps.append(imp) + type_labels.append("LightGBM") + + cat_models = _collect_catboost_models(pipeline) + for _, m in cat_models: + imp = _catboost_importance(m, feature_cols) + if imp: + per_model_imps.append(imp) + type_labels.append("CatBoost") - sample = X - if len(X) > max_rows: - sample = X.sample(n=max_rows, random_state=0) + sklearn_models = _collect_sklearn_tree_models(pipeline) + for _, m in sklearn_models: + imp = _sklearn_tree_importance(m, feature_cols) + if imp: + per_model_imps.append(imp) + type_labels.append("SklearnTree") - aggregated: dict[str, list[float]] = {} - for _, model in xgb_models: - contribs, cols = _xgb_contribs(model, sample) - mean_abs = np.mean(np.abs(contribs[:, :-1]), axis=0) - for col, value in zip(cols, mean_abs, strict=False): - aggregated.setdefault(col, []).append(float(value)) + if not per_model_imps: + return {}, "none" - averaged = {col: float(np.mean(vals)) for col, vals in aggregated.items()} + averaged = _aggregate_importance(per_model_imps) ranked = sorted(averaged.items(), key=lambda kv: kv[1], reverse=True)[:top_n] - return dict(ranked) + label = "+".join(sorted(set(type_labels))) + return dict(ranked), label -def log_xgboost_shap_importance( +def log_universal_feature_importance( pipeline: Pipeline | MultiHeadPipeline, X: pd.DataFrame, *, + feature_cols: list[str], top_n: int = 20, -) -> None: +) -> dict[str, float]: + """Log universal feature importance for all supported model types to WandB. + + Args: + pipeline: Trained pipeline. + X: Feature DataFrame. + feature_cols: Feature column names. + top_n: Number of top features to log. + + Returns: + The importance dict (empty if WandB is not active or no models found). + """ if not _wandb_active(): - return + return {} import wandb - importance = compute_xgboost_feature_importance(pipeline, X, top_n=top_n) + importance, model_type = compute_universal_feature_importance( + pipeline, X, feature_cols=feature_cols, top_n=top_n + ) if not importance: - return + return {} table = wandb.Table(columns=["feature", "mean_abs_contribution"]) for feature, score in importance.items(): table.add_data(feature, score) wandb.log( { - "diagnostics/shap_top_features": table, - "diagnostics/shap_bar": wandb.plot.bar( + "diagnostics/feature_importance_top": table, + "diagnostics/feature_importance_bar": wandb.plot.bar( table, "feature", "mean_abs_contribution", - title="Top feature contributions (XGBoost pred_contribs)", + title=f"Top feature importance ({model_type})", ), } ) + if wandb.run is not None: + wandb.run.summary["diagnostics/feature_importance_model_type"] = model_type + return importance diff --git a/src/alphapulse/evaluation/wandb_diagnostics.py b/src/alphapulse/evaluation/wandb_diagnostics.py index 6d7c681..329506e 100644 --- a/src/alphapulse/evaluation/wandb_diagnostics.py +++ b/src/alphapulse/evaluation/wandb_diagnostics.py @@ -14,6 +14,7 @@ MAX_HEXBIN_POINTS = 10_000 FEATURE_EXPOSURE_TOP_N = 15 MAX_FNC_FEATURES = 200 +_ERA_IMPORTANCE_MIN_ROWS = 10 def _wandb_active() -> bool: @@ -96,8 +97,26 @@ def log_experiment_diagnostics( metrics: dict[str, float], meta_model_preds: np.ndarray | None = None, log_shap: bool = True, + log_feature_report: bool = True, + log_era_importance: bool = False, compute_fnc: bool | None = None, ) -> None: + """Log comprehensive XAI and backtest diagnostics to the active WandB run. + + Args: + pipeline: Trained pipeline. + X_val: Validation features (may include era column). + y_val: Validation targets. + era_val: Era labels aligned with X_val. + feature_cols: Feature column names (must not include "era"). + metrics: Backtest metrics dict. + meta_model_preds: Optional meta-model predictions for MMC logging. + log_shap: If True, log universal feature importance (all model types). + log_feature_report: If True, log per-era stability report via LightGBM proxy. + log_era_importance: If True, log era-stratified importance from pipeline models + (expensive — recommended only for best-trial diagnostics). + compute_fnc: Whether to log FNC. Auto-detected from feature count when None. + """ if not _wandb_active(): return @@ -133,9 +152,17 @@ def log_experiment_diagnostics( wandb.log({"diagnostics/fnc_sharpe": metrics["fnc_sharpe"]}) if log_shap: - from ..evaluation.shap_report import log_xgboost_shap_importance + from ..evaluation.shap_report import log_universal_feature_importance - log_xgboost_shap_importance(pipeline, X_use, top_n=20) + log_universal_feature_importance( + pipeline, X_use, feature_cols=feature_cols, top_n=20 + ) + + if log_feature_report: + _log_feature_report(X_use, y_val, era_val, feature_cols) + + if log_era_importance: + _log_era_stratified_importance(pipeline, X_use, feature_cols, era_val) def _log_per_era_correlation( @@ -148,39 +175,86 @@ def _log_per_era_correlation( return cumulative = per_era.cumsum() - table = wandb.Table(columns=["era", "correlation", "cumulative_correlation"]) - for era, corr in per_era.items(): - table.add_data(str(era), float(corr), float(cumulative.loc[era])) + cum_arr = cumulative.to_numpy(dtype=np.float64) + peak_arr = np.maximum.accumulate(cum_arr) + drawdown = pd.Series(peak_arr - cum_arr, index=per_era.index) + + table = wandb.Table( + columns=[ + "era_index", + "era", + "correlation", + "cumulative_correlation", + "drawdown", + ] + ) + for idx, (era, corr) in enumerate(per_era.items()): + table.add_data( + idx, + str(era), + float(corr), + float(cumulative.loc[era]), + float(drawdown.loc[era]), + ) wandb.log( { "diagnostics/per_era_correlation_table": table, "diagnostics/per_era_correlation": wandb.plot.line( - table, "era", "correlation", title="Per-era correlation" + table, "era_index", "correlation", title="Per-era Spearman correlation" ), "diagnostics/cumulative_correlation": wandb.plot.line( table, - "era", + "era_index", "cumulative_correlation", title="Cumulative per-era correlation", ), + "diagnostics/drawdown_curve": wandb.plot.line( + table, + "era_index", + "drawdown", + title="Drawdown from peak cumulative correlation", + ), } ) + valid_corrs = per_era.to_numpy(dtype=np.float64) + valid_corrs = valid_corrs[np.isfinite(valid_corrs)] + if len(valid_corrs) >= 5: + counts, edges = np.histogram(valid_corrs, bins=30, range=(-0.1, 0.1)) + mid = 0.5 * (edges[:-1] + edges[1:]) + dist_table = wandb.Table(columns=["bin_center", "count"]) + for m, c in zip(mid, counts, strict=False): + dist_table.add_data(float(m), int(c)) + wandb.log( + { + "diagnostics/corr_distribution": wandb.plot.bar( + dist_table, + "bin_center", + "count", + title="Distribution of per-era correlations", + ) + } + ) + def _log_prediction_diagnostics(y_val: pd.Series, preds: np.ndarray) -> None: import wandb ranked = rank_normalize(preds) - hist_table = wandb.Table(columns=["rank_normalized_prediction"]) - for value in ranked[np.isfinite(ranked)]: - hist_table.add_data(float(value)) + finite_ranked = ranked[np.isfinite(ranked)] + counts, edges = np.histogram(finite_ranked, bins=50) + midpoints = 0.5 * (edges[:-1] + edges[1:]) + hist_table = wandb.Table(columns=["bin_center", "count"]) + for mid, cnt in zip(midpoints, counts, strict=False): + hist_table.add_data(float(mid), int(cnt)) wandb.log( { - "diagnostics/prediction_histogram": wandb.plot.histogram( + "diagnostics/prediction_histogram": wandb.plot.bar( hist_table, - "rank_normalized_prediction", - title="Rank-normalized predictions", + "bin_center", + "count", + title="Rank-normalized prediction distribution (50 bins)", ) } ) @@ -237,7 +311,17 @@ def _log_feature_exposure( table = wandb.Table(columns=["feature", "mean_abs_corr"]) for row in summary["top"]: table.add_data(row["feature"], row["mean_abs_corr"]) - wandb.log({"diagnostics/feature_exposure_top": table}) + wandb.log( + { + "diagnostics/feature_exposure_top": table, + "diagnostics/feature_exposure_bar": wandb.plot.bar( + table, + "feature", + "mean_abs_corr", + title="Feature exposure (top 15 by mean |corr| with predictions)", + ), + } + ) def _log_ensemble_diagnostics( @@ -277,4 +361,202 @@ def _log_ensemble_diagnostics( for j, b in enumerate(names): if j >= i: table.add_data(a, b, corr[a][b]) - wandb.log({"diagnostics/ensemble_correlation_matrix": table}) + + pair_table = wandb.Table(columns=["pair", "correlation"]) + for i, a in enumerate(names): + for j, b in enumerate(names): + if j > i: + pair_table.add_data(f"{a}→{b}", corr[a][b]) + + logged: dict[str, Any] = {"diagnostics/ensemble_correlation_matrix": table} + if len(names) > 1: + logged["diagnostics/ensemble_correlation_bar"] = wandb.plot.bar( + pair_table, + "pair", + "correlation", + title="Model pair correlations (lower = more diverse ensemble)", + ) + wandb.log(logged) + + +def _log_feature_report( + X_val: pd.DataFrame, + y_val: pd.Series, + era_val: pd.Series, + feature_cols: list[str], + *, + top_n: int = 20, +) -> None: + """Log per-era feature stability report (LightGBM proxy) to WandB. + + Calls compute_feature_report and logs three tables: top features by mean + importance, top features by era stability, and worst features by stability. + Silently skips if lightgbm is not installed. + """ + if not _wandb_active(): + return + + import wandb + + try: + from ..evaluation.feature_report import compute_feature_report + except ImportError: + return + + try: + X_feat = X_val[feature_cols] if feature_cols else X_val + report = compute_feature_report(X_feat, y_val, era_val, top_n=top_n) + except Exception: + return + + wandb.log({"diagnostics/feature_n_eras_used": report["n_eras_used"]}) + + if report["top_by_mean"]: + table_mean = wandb.Table(columns=["feature", "mean_importance"]) + for row in report["top_by_mean"]: + table_mean.add_data(row["feature"], row["mean_importance"]) + wandb.log( + { + "diagnostics/feature_top_by_mean": table_mean, + "diagnostics/feature_importance_mean_bar": wandb.plot.bar( + table_mean, + "feature", + "mean_importance", + title="Top features by mean importance (LightGBM proxy, per era)", + ), + } + ) + + if report["top_by_stability"]: + table_stab = wandb.Table(columns=["feature", "stability", "mean_importance"]) + for row in report["top_by_stability"]: + table_stab.add_data( + row["feature"], row["stability"], row["mean_importance"] + ) + wandb.log( + { + "diagnostics/feature_top_by_stability": table_stab, + "diagnostics/feature_stability_bar": wandb.plot.bar( + table_stab, + "feature", + "stability", + title="Most stable features across eras (mean/std ratio)", + ), + } + ) + + if report["bottom_by_stability"]: + table_worst = wandb.Table(columns=["feature", "stability", "mean_importance"]) + for row in report["bottom_by_stability"]: + table_worst.add_data( + row["feature"], row["stability"], row["mean_importance"] + ) + wandb.log( + { + "diagnostics/feature_worst_stability": table_worst, + "diagnostics/feature_worst_stability_bar": wandb.plot.bar( + table_worst, + "feature", + "stability", + title="Least stable features across eras (worst to prune)", + ), + } + ) + + +def _log_era_stratified_importance( + pipeline: Pipeline | MultiHeadPipeline, + X_val: pd.DataFrame, + feature_cols: list[str], + era_val: pd.Series, + *, + top_n: int = 20, + max_eras: int = 30, +) -> None: + """Log era-stratified feature importance from the actual trained pipeline models. + + Samples up to max_eras eras, computes universal feature importance on each + era slice, then summarizes stability (mean/std ratio) and logs a heatmap table. + + Args: + pipeline: Trained pipeline. + X_val: Validation features (pre-selected to feature_cols). + feature_cols: Feature column names. + era_val: Era labels aligned with X_val. + top_n: Number of top features to include in the heatmap. + max_eras: Maximum eras to sample (keeps runtime bounded). + """ + if not _wandb_active(): + return + + import wandb + + from ..evaluation.shap_report import compute_universal_feature_importance + + e_arr = np.asarray(era_val.to_numpy()) + unique_eras = sorted(pd.unique(e_arr), key=str) + + if len(unique_eras) > max_eras: + rng = np.random.default_rng(42) + unique_eras = list(rng.choice(unique_eras, size=max_eras, replace=False)) + + era_imps: list[dict[str, float]] = [] + era_labels: list[str] = [] + + for era in unique_eras: + mask = e_arr == era + if mask.sum() < _ERA_IMPORTANCE_MIN_ROWS: + continue + X_era = X_val[mask] + imp, _ = compute_universal_feature_importance( + pipeline, X_era, feature_cols=feature_cols, top_n=top_n + ) + if imp: + era_imps.append(imp) + era_labels.append(str(era)) + + if not era_imps: + return + + all_features = sorted( + {f for imp in era_imps for f in imp}, + key=lambda f: -float(np.mean([imp.get(f, 0.0) for imp in era_imps])), + )[:top_n] + + imp_matrix = np.array([[imp.get(f, 0.0) for f in all_features] for imp in era_imps]) + mean_imp = imp_matrix.mean(axis=0) + std_imp = imp_matrix.std(axis=0, ddof=0) + stability = mean_imp / (std_imp + 1e-10) + + stab_table = wandb.Table( + columns=["feature", "mean_importance", "std_importance", "stability"] + ) + for feat, mean_v, std_v, stab_v in zip( + all_features, mean_imp, std_imp, stability, strict=False + ): + stab_table.add_data(feat, float(mean_v), float(std_v), float(stab_v)) + wandb.log( + { + "diagnostics/era_importance_stability": stab_table, + "diagnostics/era_importance_stability_bar": wandb.plot.bar( + stab_table, + "feature", + "stability", + title="Era-stratified importance stability (mean/std)", + ), + } + ) + + xs = list(range(len(era_labels))) + ys = [[float(imp.get(f, 0.0)) for imp in era_imps] for f in all_features] + wandb.log( + { + "diagnostics/era_importance_over_time": wandb.plot.line_series( + xs=xs, + ys=ys, + keys=all_features, + title="Feature importance across eras (each line = one feature)", + xname="era_index", + ), + } + ) diff --git a/src/alphapulse/logging_/wandb_utils.py b/src/alphapulse/logging_/wandb_utils.py index 58a4289..9956ac6 100644 --- a/src/alphapulse/logging_/wandb_utils.py +++ b/src/alphapulse/logging_/wandb_utils.py @@ -99,6 +99,9 @@ def log_hpo_summary_table( "max_drawdown", "pct_positive_eras", "model_types", + "model_1_type", + "model_2_type", + "model_3_type", "scaler_type", "use_packboost", "num_models", @@ -106,6 +109,15 @@ def log_hpo_summary_table( "ensemble_method", "use_neutralization", "neutralization_proportion", + "xgb_max_depth", + "xgb_learning_rate", + "lgbm_num_leaves", + "lgbm_learning_rate", + "lgbm_min_child_samples", + "use_noise_injection", + "feature_selection_type", + "use_feature_selection", + "use_augmentation", "elapsed_seconds", ] table = wandb.Table(columns=columns) @@ -129,6 +141,9 @@ def log_hpo_summary_table( r.metrics.get("max_drawdown"), r.metrics.get("pct_positive_eras"), model_types, + r.params.get("model_1_type"), + r.params.get("model_2_type"), + r.params.get("model_3_type"), r.params.get("scaler_type"), r.params.get("use_packboost"), r.params.get("num_models", 1), @@ -136,6 +151,17 @@ def log_hpo_summary_table( r.params.get("ensemble_method"), r.params.get("use_neutralization"), r.params.get("neutralization_proportion"), + r.params.get("model_1_max_depth") or r.params.get("xgb_max_depth"), + r.params.get("model_1_learning_rate") or r.params.get("xgb_learning_rate"), + r.params.get("model_1_num_leaves") or r.params.get("lgbm_num_leaves"), + r.params.get("model_1_learning_rate_lgbm") + or r.params.get("lgbm_learning_rate"), + r.params.get("model_1_min_child_samples") + or r.params.get("lgbm_min_child_samples"), + r.params.get("use_noise_injection"), + r.params.get("feature_selection_type"), + r.params.get("use_feature_selection"), + r.params.get("use_augmentation"), r.elapsed_seconds, ) @@ -174,11 +200,43 @@ def log_hpo_trial_metrics( logged["mmc_sharpe"] = result.mmc_sharpe if result.payout_score is not None: logged["payout_score"] = result.payout_score + top_level_keys = {"sharpe", "corr_sharpe", "mmc_sharpe", "payout_score"} for k, v in (result.metrics or {}).items(): - logged[f"metric/{k}"] = v + if k not in top_level_keys: + logged[f"metric/{k}"] = v wandb.log(logged) +def log_importance_artifact( + importance: dict[str, float], + *, + name: str = "feature-importance", +) -> None: + """Log a feature importance dict as a WandB CSV Artifact on the active run. + + Args: + importance: Mapping of feature name to importance score, sorted descending. + name: Artifact name (used as the WandB artifact identifier). + """ + import io + + import wandb + + if wandb.run is None: + return + + rows = sorted(importance.items(), key=lambda kv: kv[1], reverse=True) + buf = io.StringIO() + buf.write("feature,importance\n") + for feat, score in rows: + buf.write(f"{feat},{score}\n") + + artifact = wandb.Artifact(name=name, type="dataset") + with artifact.new_file("feature_importance.csv", mode="w") as f: + f.write(buf.getvalue()) + wandb.run.log_artifact(artifact) + + def log_hpo_trial( result: "TrialResult", flat_config: dict[str, Any], @@ -220,3 +278,49 @@ def log_hpo_trial( result, objective, model_types=model_types, preprocessors=preprocessors ) wandb.finish(quiet=True) + + +def log_hpo_convergence( + results: list[Any], + *, + project: str, + group: str, +) -> None: + """Log per-trial corr_sharpe and running best in a single WandB convergence run. + + All trials are logged as ordered steps within one run so that WandB renders + a proper convergence curve (trial scores + running maximum line). + + Args: + results: All TrialResult objects from the HPO search, in trial order. + project: WandB project name. + group: WandB group name (same as HPO run group). + """ + import wandb + + wandb.init( + project=project, + group=group, + name="search-convergence", + job_type="convergence", + reinit=True, + ) + best_so_far = float("-inf") + for r in results: + if r.error: + continue + trial_corr = ( + r.corr_sharpe + if r.corr_sharpe not in (float("-inf"), float("inf")) + else r.sharpe + ) + if trial_corr > best_so_far: + best_so_far = trial_corr + wandb.log( + { + "trial_corr_sharpe": trial_corr, + "best_corr_sharpe_so_far": best_so_far, + }, + step=r.trial_number, + ) + wandb.finish(quiet=True) diff --git a/tests/test_xai.py b/tests/test_xai.py new file mode 100644 index 0000000..c9bd1de --- /dev/null +++ b/tests/test_xai.py @@ -0,0 +1,202 @@ +"""Tests for universal feature importance extraction (XAI).""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest + +from alphapulse.evaluation.shap_report import ( + compute_universal_feature_importance, + log_universal_feature_importance, +) +from alphapulse.models.sklearn_models import RandomForestModel +from alphapulse.models.xgboost_model import XGBoostModel +from alphapulse.pipeline.pipeline import Pipeline +from alphapulse.preprocessors.base import BasePreprocessor + +N_ROWS = 200 +N_FEATURES = 10 +FEATURE_COLS = [f"f{i}" for i in range(N_FEATURES)] + + +class _IdentityPreprocessor(BasePreprocessor): + def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> _IdentityPreprocessor: + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + return X + + +def _make_data() -> tuple[pd.DataFrame, pd.Series]: + rng = np.random.RandomState(42) + X = pd.DataFrame(rng.randn(N_ROWS, N_FEATURES), columns=FEATURE_COLS) + y = pd.Series(rng.randn(N_ROWS), name="target") + return X, y + + +def _xgb_pipeline(X: pd.DataFrame, y: pd.Series) -> Pipeline: + model = XGBoostModel() + pipe = Pipeline(preprocessors=[_IdentityPreprocessor()], model=model) + pipe.fit(X, y, n_rounds=10, early_stopping_rounds=5) + return pipe + + +def _lgbm_pipeline(X: pd.DataFrame, y: pd.Series) -> Pipeline: + pytest.importorskip("lightgbm") + from alphapulse.models.lightgbm_model import LightGBMModel + + model = LightGBMModel( + params={ + "objective": "regression", + "metric": "rmse", + "max_depth": 3, + "learning_rate": 0.1, + "num_leaves": 8, + "min_child_samples": 5, + "verbosity": -1, + "n_jobs": 1, + }, + n_estimators=20, + ) + pipe = Pipeline(preprocessors=[_IdentityPreprocessor()], model=model) + pipe.fit(X, y, n_rounds=20) + return pipe + + +def _catboost_pipeline(X: pd.DataFrame, y: pd.Series) -> Pipeline: + pytest.importorskip("catboost") + from alphapulse.models.catboost_model import CatBoostModel + + model = CatBoostModel(iterations=20) + pipe = Pipeline(preprocessors=[_IdentityPreprocessor()], model=model) + pipe.fit(X, y, n_rounds=20) + return pipe + + +def _rf_pipeline(X: pd.DataFrame, y: pd.Series) -> Pipeline: + model = RandomForestModel( + params={"n_estimators": 10, "n_jobs": 1, "random_state": 0} + ) + pipe = Pipeline(preprocessors=[_IdentityPreprocessor()], model=model) + pipe.fit(X, y) + return pipe + + +def _ridge_pipeline(X: pd.DataFrame, y: pd.Series) -> Pipeline: + from alphapulse.models.sklearn_models import RidgeModel + + model = RidgeModel() + pipe = Pipeline(preprocessors=[_IdentityPreprocessor()], model=model) + pipe.fit(X, y) + return pipe + + +class TestComputeUniversalFeatureImportance: + def test_xgboost_returns_nonempty_dict(self) -> None: + X, y = _make_data() + pipe = _xgb_pipeline(X, y) + imp, label = compute_universal_feature_importance( + pipe, X, feature_cols=FEATURE_COLS, top_n=5 + ) + assert len(imp) > 0 + assert len(imp) <= 5 + assert label == "XGBoost" + + def test_xgboost_scores_are_finite_and_nonneg(self) -> None: + X, y = _make_data() + pipe = _xgb_pipeline(X, y) + imp, _ = compute_universal_feature_importance( + pipe, X, feature_cols=FEATURE_COLS + ) + assert all(np.isfinite(v) and v >= 0 for v in imp.values()) + + def test_lgbm_returns_nonempty_dict(self) -> None: + X, y = _make_data() + pipe = _lgbm_pipeline(X, y) + imp, label = compute_universal_feature_importance( + pipe, X, feature_cols=FEATURE_COLS, top_n=5 + ) + assert len(imp) > 0 + assert "LightGBM" in label + + def test_catboost_returns_nonempty_dict(self) -> None: + X, y = _make_data() + pipe = _catboost_pipeline(X, y) + imp, label = compute_universal_feature_importance( + pipe, X, feature_cols=FEATURE_COLS, top_n=5 + ) + assert len(imp) > 0 + assert "CatBoost" in label + + def test_random_forest_returns_nonempty_dict(self) -> None: + X, y = _make_data() + pipe = _rf_pipeline(X, y) + imp, label = compute_universal_feature_importance( + pipe, X, feature_cols=FEATURE_COLS, top_n=5 + ) + assert len(imp) > 0 + assert "SklearnTree" in label + + def test_ridge_only_returns_empty(self) -> None: + X, y = _make_data() + pipe = _ridge_pipeline(X, y) + imp, label = compute_universal_feature_importance( + pipe, X, feature_cols=FEATURE_COLS + ) + assert imp == {} + assert label == "none" + + def test_top_n_limits_result_size(self) -> None: + X, y = _make_data() + pipe = _xgb_pipeline(X, y) + for top_n in (3, 7, N_FEATURES): + imp, _ = compute_universal_feature_importance( + pipe, X, feature_cols=FEATURE_COLS, top_n=top_n + ) + assert len(imp) <= top_n + + def test_result_sorted_descending(self) -> None: + X, y = _make_data() + pipe = _xgb_pipeline(X, y) + imp, _ = compute_universal_feature_importance( + pipe, X, feature_cols=FEATURE_COLS + ) + scores = list(imp.values()) + assert scores == sorted(scores, reverse=True) + + def test_keys_are_subset_of_feature_cols(self) -> None: + X, y = _make_data() + pipe = _xgb_pipeline(X, y) + imp, _ = compute_universal_feature_importance( + pipe, X, feature_cols=FEATURE_COLS + ) + assert set(imp.keys()).issubset(set(FEATURE_COLS)) + + def test_mixed_xgb_rf_pipeline_nonempty(self) -> None: + X, y = _make_data() + xgb_model = XGBoostModel(name="xgb1") + rf_model = RandomForestModel( + params={"n_estimators": 5, "n_jobs": 1, "random_state": 0}, name="rf1" + ) + pipe = Pipeline( + preprocessors=[_IdentityPreprocessor()], + models=[xgb_model, rf_model], + ensemble_method="weighted", + ensemble_params={"weights": [0.5, 0.5]}, + ) + pipe.fit(X, y, n_rounds=5) + imp, label = compute_universal_feature_importance( + pipe, X, feature_cols=FEATURE_COLS + ) + assert len(imp) > 0 + assert "XGBoost" in label + assert "SklearnTree" in label + + +class TestLogUniversalFeatureImportance: + def test_returns_empty_when_wandb_inactive(self) -> None: + X, y = _make_data() + pipe = _xgb_pipeline(X, y) + result = log_universal_feature_importance(pipe, X, feature_cols=FEATURE_COLS) + assert result == {}