diff --git a/docs/source/api_reference/regression.rst b/docs/source/api_reference/regression.rst
index d312b46a1..253410a5e 100644
--- a/docs/source/api_reference/regression.rst
+++ b/docs/source/api_reference/regression.rst
@@ -196,6 +196,21 @@ This section lists simple regressors which can be used as baselines.
     DeltaPointRegressor
     DummyProbaRegressor
 
+.. currentmodule:: skpro.regression.unconditional_distfit
+
+.. autosummary::
+    :toctree: auto_generated/
+    :template: class.rst
+
+    UnconditionalDistfitRegressor
+
+.. currentmodule:: skpro.regression.deterministic_reduction
+
+.. autosummary::
+    :toctree: auto_generated/
+    :template: class.rst
+
+    DeterministicReductionRegressor
 
 Linear regression
 -----------------
diff --git a/examples/baseline_regressors_demo.py b/examples/baseline_regressors_demo.py
new file mode 100644
index 000000000..50e64aefd
--- /dev/null
+++ b/examples/baseline_regressors_demo.py
@@ -0,0 +1,33 @@
+"""Example usage for baseline probabilistic regressors."""
+import logging
+
+import numpy as np
+from sklearn.linear_model import LinearRegression
+
+from skpro.regression.deterministic_reduction import DeterministicReductionRegressor
+from skpro.regression.unconditional_distfit import UnconditionalDistfitRegressor
+
+logging.basicConfig(level=logging.INFO)
+
+# Generate synthetic data
+X = np.random.randn(100, 3)
+y = 2 * X[:, 0] + np.random.randn(100)
+
+# 1. Unconditional density baseline (featureless)
+reg1 = UnconditionalDistfitRegressor()
+reg1.fit(X, y)
+dist1 = reg1.predict_proba(X)
+logging.info("UnconditionalDistfitRegressor mean: %s", dist1.mean())
+logging.info("Sample from unconditional: %s", dist1.sample(5))
+
+# 2. Deterministic-style baseline (mean from regressor, constant variance)
+reg2 = DeterministicReductionRegressor(LinearRegression(), distr_type="gaussian")
+reg2.fit(X, y)
+dist2 = reg2.predict_proba(X)
+logging.info("DeterministicReductionRegressor mean: %s", dist2.mean())
+logging.info("DeterministicReductionRegressor sigma: %s", dist2.sigma)
+logging.info("Sample from deterministic baseline: %s", dist2.sample(5))
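+
+# Interval predictions are also available through the standard skpro
+# regressor API (a usage sketch, not specific to the new baselines):
+logging.info("80%% interval: %s", reg2.predict_interval(X, coverage=0.8))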
diff --git a/examples/baseline_regressors_kde_hist.py b/examples/baseline_regressors_kde_hist.py
new file mode 100644
index 000000000..e8d3a4efc
--- /dev/null
+++ b/examples/baseline_regressors_kde_hist.py
@@ -0,0 +1,27 @@
+"""Example: unconditional 'norm' and 'laplace' baselines with distfit."""
+import logging
+
+import numpy as np
+
+from skpro.regression.unconditional_distfit import UnconditionalDistfitRegressor
+
+logging.basicConfig(level=logging.INFO)
+
+X = np.random.randn(80, 2)
+y = np.random.randn(80)
+
+# Distfit norm baseline
+reg_norm = UnconditionalDistfitRegressor(distr_type="norm")
+reg_norm.fit(X, y)
+dist_norm = reg_norm.predict_proba(X)
+logging.info("Norm baseline mean: %s", dist_norm.mean())
+
+# Distfit laplace baseline
+reg_laplace = UnconditionalDistfitRegressor(distr_type="laplace")
+reg_laplace.fit(X, y)
+dist_laplace = reg_laplace.predict_proba(X)
+logging.info("Laplace baseline mean: %s", dist_laplace.mean())
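+
+# The fitted distfit parameters can be inspected on the wrapped distribution;
+# distfit stores them in a dict with "loc" and "scale" entries:
+logging.info("Fitted laplace parameters: %s", dist_laplace.distfit_obj.model)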
diff --git a/examples/benchmark_baseline_regressors.py b/examples/benchmark_baseline_regressors.py
new file mode 100644
index 000000000..0566585b2
--- /dev/null
+++ b/examples/benchmark_baseline_regressors.py
@@ -0,0 +1,56 @@
+"""Benchmark script for baseline probabilistic regressors."""
+import logging
+
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LinearRegression
+
+from skpro.metrics import PinballLoss
+from skpro.regression.deterministic_reduction import DeterministicReductionRegressor
+from skpro.regression.unconditional_distfit import UnconditionalDistfitRegressor
+
+logging.basicConfig(level=logging.INFO)
+
+# Generate synthetic data
+X = pd.DataFrame(np.random.randn(200, 5), columns=[f"x{i}" for i in range(5)])
+y = pd.DataFrame({"y": 3 * X["x0"] - 2 * X["x1"] + np.random.randn(200)})
+
+# Split into train and test sets
+X_train, X_test = X.iloc[:150], X.iloc[150:]
+y_train, y_test = y.iloc[:150], y.iloc[150:]
+
+# Baseline 1: unconditional density fit, ignores the features
+reg1 = UnconditionalDistfitRegressor()
+reg1.fit(X_train, y_train)
+
+# Baseline 2: deterministic reduction, mean from the wrapped regressor
+reg2 = DeterministicReductionRegressor(LinearRegression(), distr_type="gaussian")
+reg2.fit(X_train, y_train)
+
+# Evaluate pinball loss at alpha=0.1, 0.5, 0.9.
+# PinballLoss scores quantile predictions, so these are obtained via
+# predict_quantiles rather than predict_proba.
+alphas = [0.1, 0.5, 0.9]
+for alpha in alphas:
+    q1 = reg1.predict_quantiles(X_test, alpha=alpha)
+    q2 = reg2.predict_quantiles(X_test, alpha=alpha)
+    loss1 = PinballLoss(alpha=alpha)(y_test, q1)
+    loss2 = PinballLoss(alpha=alpha)(y_test, q2)
+    logging.info(
+        "Alpha=%s: UnconditionalDistfitRegressor pinball loss=%.4f, "
+        "DeterministicReductionRegressor pinball loss=%.4f",
+        alpha,
+        loss1,
+        loss2,
+    )
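+
+# CRPS can score the predictive distributions directly (a sketch; assumes
+# skpro's CRPS distribution metric accepts these predictive distributions,
+# and that its sampling-based approximation is acceptable here):
+from skpro.metrics import CRPS  # noqa: E402
+
+crps = CRPS()
+dist1 = reg1.predict_proba(X_test)
+dist2 = reg2.predict_proba(X_test)
+logging.info("CRPS unconditional=%.4f", crps(y_test, dist1))
+logging.info("CRPS deterministic reduction=%.4f", crps(y_test, dist2))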
diff --git a/skpro/distributions/tests/test_proba_basic.py b/skpro/distributions/tests/test_proba_basic.py
index 3cb1bf26d..c9c66cea3 100644
--- a/skpro/distributions/tests/test_proba_basic.py
+++ b/skpro/distributions/tests/test_proba_basic.py
@@ -112,6 +112,9 @@ def test_proba_index_coercion():
 
 @pytest.mark.parametrize("fun", ["pdf", "ppf", "cdf"])
 def test_proba_plotting(fun):
     """Test that plotting functions do not crash and return ax as expected."""
+    import matplotlib
+
+    matplotlib.use("Agg", force=True)
     from matplotlib.axes import Axes
     from matplotlib.figure import Figure
diff --git a/skpro/regression/__init__.py b/skpro/regression/__init__.py
index 8ceb27150..f72940d32 100644
--- a/skpro/regression/__init__.py
+++ b/skpro/regression/__init__.py
@@ -5,13 +5,17 @@
     MapieCrossConformalRegressor,
     MapieSplitConformalRegressor,
 )
+from skpro.regression.deterministic_reduction import DeterministicReductionRegressor
 from skpro.regression.jackknife import MapieJackknifeAfterBootstrapRegressor
 from skpro.regression.nonparametric import NadarayaWatsonCDE
+from skpro.regression.unconditional_distfit import UnconditionalDistfitRegressor
 
 __all__ = [
+    "DeterministicReductionRegressor",
     "MapieSplitConformalRegressor",
     "MapieCrossConformalRegressor",
     "MapieConformalizedQuantileRegressor",
     "MapieJackknifeAfterBootstrapRegressor",
     "NadarayaWatsonCDE",
+    "UnconditionalDistfitRegressor",
 ]
diff --git a/skpro/regression/deterministic_reduction.py b/skpro/regression/deterministic_reduction.py
new file mode 100644
index 000000000..5c8724335
--- /dev/null
+++ b/skpro/regression/deterministic_reduction.py
@@ -0,0 +1,155 @@
+"""Deterministic regression reduction baseline.
+
+Outputs a Gaussian (or Laplace) distribution with mean equal to the point
+prediction and variance equal to the training sample variance of the target.
+"""
+
+import numpy as np
+
+from skpro.distributions.laplace import Laplace
+from skpro.distributions.normal import Normal
+from skpro.regression.base import BaseProbaRegressor
+
+
+class DeterministicReductionRegressor(BaseProbaRegressor):
+    """
+    Wraps a deterministic point prediction regressor as a probabilistic regressor.
+
+    The output distribution has mean equal to the wrapped regressor's
+    prediction and variance equal to the training sample variance of y.
+    Multi-output y is not supported (raises NotImplementedError).
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import LinearRegression
+    >>> from skpro.regression.deterministic_reduction import (
+    ...     DeterministicReductionRegressor
+    ... )
+    >>> import pandas as pd
+    >>> X = pd.DataFrame({"a": [1, 2, 3]})
+    >>> y = pd.DataFrame([1, 2, 3])
+    >>> reg = DeterministicReductionRegressor(
+    ...     LinearRegression(),
+    ...     distr_type="gaussian"
+    ... )
+    >>> reg.fit(X, y)  # doctest: +ELLIPSIS
+    DeterministicReductionRegressor(...)
+    >>> dist = reg.predict_proba(X)
+    >>> dist.mean()  # doctest: +NORMALIZE_WHITESPACE
+         0
+    0  1.0
+    1  2.0
+    2  3.0
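+
+    The Laplace variant matches the training variance, using scale = sqrt(var / 2):
+
+    >>> reg_l = DeterministicReductionRegressor(
+    ...     LinearRegression(), distr_type="laplace"
+    ... )
+    >>> reg_l.fit(X, y)  # doctest: +ELLIPSIS
+    DeterministicReductionRegressor(...)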
+
+    References
+    ----------
+    - Gaussian Processes for State Space Models and Change Point Detection
+      (Turner, 2011, PhD thesis). https://mlg.eng.cam.ac.uk/pub/pdf/Tur11.pdf
+    - A Probabilistic View of Linear Regression
+      (Bishop, PRML; Keng, 2016; various tutorials).
+    - mlr3proba and related probabilistic ML frameworks.
+    - Efficient and Distance-Aware Deep Regressor for Uncertainty Quantification
+      (Bui et al., 2024).
+      https://proceedings.mlr.press/v238/manh-bui24a/manh-bui24a.pdf
+    """
+
+    _tags = {
+        "authors": ["arnavk23"],
+        "estimator_type": "regressor_proba",
+        # estimator tags
+        # --------------
+        "capability:multioutput": False,
+        "capability:missing": True,
+        "X_inner_mtype": "pd_DataFrame_Table",
+        "y_inner_mtype": "pd_DataFrame_Table",
+    }
+
+    def __init__(self, regressor, distr_type="gaussian"):
+        allowed_types = ["gaussian", "laplace"]
+        if distr_type not in allowed_types:
+            raise ValueError(
+                f"distr_type must be one of {allowed_types}, got {distr_type}"
+            )
+        self.regressor = regressor
+        self.distr_type = distr_type
+        super().__init__()
+
+    def _fit(self, X, y, C=None):
+        # Ensure X and y are DataFrames with string column names
+        import pandas as pd
+        from sklearn.base import clone
+
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X)
+        X = X.copy()
+        X.columns = [str(col) for col in X.columns]
+        if not isinstance(y, pd.DataFrame):
+            y = pd.DataFrame(y)
+        y = y.copy()
+        y.columns = [str(col) for col in y.columns]
+        if y.shape[1] > 1:
+            raise NotImplementedError(
+                "DeterministicReductionRegressor only supports univariate y. "
+                f"Got shape: {y.shape}"
+            )
+        self._X_cols = X.columns
+        self._y_cols = y.columns
+        self._X_index = X.index
+        self._y_index = y.index
+        # Clone the regressor to avoid mutating the parameter
+        self.regressor_ = clone(self.regressor)
+        # y is univariate at this point, so fit on a 1D array
+        self.regressor_.fit(X, y.values.ravel())
+        y_arr = y.values.flatten()
+        self.train_mean_ = np.mean(y_arr)
+        self.train_var_ = np.var(y_arr)
+        return self
+
+    def _predict_proba(self, X):
+        import pandas as pd
+
+        # Ensure X is a DataFrame with string column names
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X, columns=self._X_cols)
+        X = X.copy()
+        X.columns = [str(col) for col in X.columns]
+        mean_pred = self.regressor_.predict(X)
+        # Ensure output shape matches y
+        if mean_pred.ndim == 1:
+            mean_pred = mean_pred.reshape(-1, 1)
+        # Return distribution with correct index/columns
+        if self.distr_type == "gaussian":
+            return Normal(
+                mu=mean_pred,
+                sigma=np.sqrt(self.train_var_),
+                index=X.index,
+                columns=self._y_cols,
+            )
+        if self.distr_type == "laplace":
+            # Laplace with scale b has variance 2 * b ** 2,
+            # so b = sqrt(var / 2) matches the training variance
+            return Laplace(
+                mu=mean_pred,
+                scale=np.sqrt(self.train_var_ / 2),
+                index=X.index,
+                columns=self._y_cols,
+            )
+        raise ValueError(f"Unknown distr_type: {self.distr_type}")
+
+    @classmethod
+    def get_test_params(cls, parameter_set="default"):
+        """Return testing parameter sets for automated tests."""
+        from sklearn.linear_model import LinearRegression
+
+        return [
+            {"regressor": LinearRegression(), "distr_type": "gaussian"},
+            {"regressor": LinearRegression(), "distr_type": "laplace"},
+        ]
diff --git a/skpro/regression/tests/test_baseline_regressors.py b/skpro/regression/tests/test_baseline_regressors.py
new file mode 100644
index 000000000..d4f9d7842
--- /dev/null
+++ b/skpro/regression/tests/test_baseline_regressors.py
@@ -0,0 +1,171 @@
+"""Tests for baseline probabilistic regressors."""
+import importlib.util
+
+import numpy as np
+import pytest
+from sklearn.linear_model import LinearRegression
+
+from skpro.regression.deterministic_reduction import DeterministicReductionRegressor
+from skpro.regression.unconditional_distfit import UnconditionalDistfitRegressor
+
+HAS_DISTFIT = importlib.util.find_spec("distfit") is not None
+
+requires_distfit = pytest.mark.skipif(not HAS_DISTFIT, reason="distfit required")
+
+
+@requires_distfit
+def test_unconditional_distfit_regressor_invalid_distr_type():
+    with pytest.raises(ValueError, match="distr_type"):
+        UnconditionalDistfitRegressor(distr_type="not_a_dist")
+
+
+@requires_distfit
+def test_unconditional_distfit_regressor_fit_and_predict():
+    rng = np.random.default_rng(42)
+    X = rng.normal(size=(100, 3))
+    y = rng.normal(size=100)
+
+    reg = UnconditionalDistfitRegressor(distr_type="norm", random_state=42)
+    reg.fit(X, y)
+    dist = reg.predict_proba(X)
+
+    samples = dist.sample(10)
+    assert samples.shape[0] == 10 * len(X)
+    assert hasattr(dist, "pdf")
+    assert hasattr(dist, "mean")
+    assert hasattr(dist, "var")
+
+
+@requires_distfit
+def test_unconditional_distfit_distribution_parameters_and_mean():
+    rng = np.random.default_rng(123)
+    X = rng.normal(size=(120, 2))
+    y = rng.normal(loc=2.5, scale=1.3, size=120)
+
+    reg = UnconditionalDistfitRegressor(distr_type="laplace", random_state=123)
+    reg.fit(X, y)
+    dist = reg.predict_proba(X)
+
+    model = dist.distfit_obj.model
+    assert isinstance(model, dict)
+    assert "loc" in model
+    assert "scale" in model
+    # distfit is fit on y only.
+    # mean of returned distribution should match fitted location.
+    assert np.allclose(dist.mean().values, model["loc"])
+
+
+@requires_distfit
+def test_unconditional_distfit_regressor_multioutput_raises():
+    rng = np.random.default_rng(7)
+    X = rng.normal(size=(100, 3))
+    y = rng.normal(size=(100, 2))
+
+    reg = UnconditionalDistfitRegressor(distr_type="norm")
+    with pytest.raises(NotImplementedError, match="univariate"):
+        reg.fit(X, y)
+
+
+@requires_distfit
+def test_unconditional_distfit_regressor_supports_numpy_arrays():
+    rng = np.random.default_rng(99)
+    X = rng.normal(size=(50, 2))
+    y = rng.normal(size=50)
+
+    reg = UnconditionalDistfitRegressor(distr_type="norm")
+    reg.fit(X, y)
+    dist = reg.predict_proba(X)
+
+    assert hasattr(dist, "mean")
+
+
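+@requires_distfit
+def test_unconditional_distfit_predict_quantiles_shape():
+    """Check quantile predictions via the base API (sketch, assumes _ppf wiring)."""
+    rng = np.random.default_rng(5)
+    X = rng.normal(size=(40, 2))
+    y = rng.normal(size=40)
+
+    reg = UnconditionalDistfitRegressor(distr_type="norm")
+    reg.fit(X, y)
+    quantiles = reg.predict_quantiles(X, alpha=[0.1, 0.9])
+
+    # one column per (variable, alpha) pair
+    assert quantiles.shape == (len(X), 2)
+
+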
+""" + + +import numpy as np + +from skpro.distributions.base import BaseDistribution +from skpro.regression.base import BaseProbaRegressor + + +class UnconditionalDistfitRegressor(BaseProbaRegressor): + """ + Featureless unconditional probabilistic regressor using distfit. + + Fits a univariate density to the target using distfit, ignoring all features. + Supports parametric (e.g., normal, laplace) and histogram fitting via distfit's API. + Multi-output y is not supported (raises NotImplementedError). + + This is a constant-uncertainty baseline: uncertainty does not shrink with more + data. Requires the optional dependency `distfit` (install with + `pip install distfit`). + + Examples + -------- + >>> import pandas as pd + >>> from skpro.regression.unconditional_distfit import ( + ... UnconditionalDistfitRegressor + ... ) + >>> y = pd.DataFrame([1, 2, 3, 4, 5]) + >>> X = pd.DataFrame(index=y.index) # featureless DataFrame + >>> reg = UnconditionalDistfitRegressor(distr_type="norm") + >>> reg.fit(X, y) + UnconditionalDistfitRegressor() + >>> dist = reg.predict_proba(X) + >>> float(dist.mean().iloc[0, 0]) + 3.0 + + References + ---------- + - mlr3proba: Probabilistic Supervised Learning in R (density estimation). + https://mlr3book.mlr-org.com/chapters/chapter13/beyond_regression_and_classification.html + - LinCDE: Conditional Density Estimation via Lindsey’s Method + (Gao & Hastie, JMLR 2022). https://jmlr.org/papers/volume23/21-0840/21-0840.pdf + - Conditional Density Estimation with Histogram Trees (Yang et al., NeurIPS 2024). + https://arxiv.org/html/2410.11449v1 + - Nonparametric Conditional Density Estimation (Hansen, 2004). + https://users.ssc.wisc.edu/~behansen/papers/ncde.pdf + - distfit documentation: https://erdogant.github.io/distfit/ + """ + + _tags = { + # packaging info + # -------------- + "authors": ["arnavk23"], + "estimator_type": "regressor_proba", + "python_dependencies": "distfit>=1.6.8", + # estimator tags + # -------------- + "capability:multioutput": False, + "capability:missing": True, + "X_inner_mtype": "pd_DataFrame_Table", + "y_inner_mtype": "pd_DataFrame_Table", + # CI and test flags + # ----------------- + "tests:vm": False, # set True if special VM is needed + } + + def __init__(self, distr_type="norm", random_state=None, fit_histogram=False): + """Initialize UnconditionalDistfitRegressor. + + Parameters + ---------- + distr_type : str, default='norm' + Distribution type for distfit (e.g., 'norm', 'laplace'; see distfit docs). + random_state : int or None + Random seed for reproducibility. + fit_histogram : bool, default=False + If True, fit a histogram using distfit's histogram option. + """ + allowed_types = ["norm", "laplace", "histogram"] + if distr_type not in allowed_types: + raise ValueError( + f"distr_type must be one of {allowed_types}, got {distr_type}" + ) + self.distr_type = distr_type + self.random_state = random_state + self.fit_histogram = fit_histogram + super().__init__() + + def _fit(self, X, y, C=None): + # Import distfit only when needed for dependency isolation + from distfit import distfit + + y_arr_raw = y.values if hasattr(y, "values") else np.asarray(y) + if y_arr_raw.ndim > 2 or (y_arr_raw.ndim == 2 and y_arr_raw.shape[1] > 1): + raise NotImplementedError( + "UnconditionalDistfitRegressor only supports univariate y. 
Got shape: " + + str(y_arr_raw.shape) + ) + + if hasattr(y, "columns"): + self._y_cols = y.columns + else: + self._y_cols = ["0"] + + y_arr = np.asarray(y_arr_raw).reshape(-1) + if self.distr_type == "kde": + raise RuntimeError( + "KDE support is removed due to scipy.stats.kde deprecation in distfit. " + "Please use a different distribution type." + ) + if self.fit_histogram: + raise RuntimeError( + "Histogram support is not available in distfit>=2.0.1. " + "Please set fit_histogram=False and use a parametric distr_type " + "such as 'norm' or 'laplace'." + ) + else: + self.distfit_ = distfit( + distr=self.distr_type, random_state=self.random_state + ) + self.distfit_.fit_transform(y_arr) + return self + + def _predict_proba(self, X): + # Return one-row-per-instance distribution with y-aligned columns. + return _DistfitDistribution(self.distfit_, index=X.index, columns=self._y_cols) + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter sets for automated tests.""" + return [ + {"distr_type": "norm", "fit_histogram": False}, + {"distr_type": "laplace", "fit_histogram": False}, + ] + + +class _DistfitDistribution(BaseDistribution): + """Wraps a distfit fitted object as a skpro distribution.""" + + def __init__(self, distfit_obj, index=None, columns=None, distr_type=None): + if isinstance(distfit_obj, np.ndarray): + distfit_obj = distfit_obj.item() + + self.distfit_obj = distfit_obj + if distr_type is None: + distr_type = getattr(self.distfit_obj, "distr", None) + self.distr_type = distr_type + super().__init__(index=index, columns=columns) + + def _get_fitted_model(self): + """Return fitted scipy frozen distribution when available.""" + model = self.distfit_obj.model + if isinstance(model, dict): + model = model.get("model", model) + return model + + def _get_scalar_mean(self): + """Return scalar mean for the fitted distribution.""" + model = self.distfit_obj.model + if isinstance(model, dict): + if "loc" in model: + return float(model["loc"]) + if "mean" in model: + return float(model["mean"]) + fitted = self._get_fitted_model() + return float(fitted.mean()) + + def _get_scalar_var(self): + """Return scalar variance for the fitted distribution.""" + model = self.distfit_obj.model + if isinstance(model, dict) and "scale" in model: + return float(model["scale"]) ** 2 + fitted = self._get_fitted_model() + return float(fitted.var()) + + def _mean(self): + return np.full(self.shape, self._get_scalar_mean(), dtype=float) + + def _var(self): + return np.full(self.shape, self._get_scalar_var(), dtype=float) + + def _pdf(self, x): + fitted = self._get_fitted_model() + return fitted.pdf(x) + + def _cdf(self, x): + fitted = self._get_fitted_model() + return fitted.cdf(x) + + def _ppf(self, p): + fitted = self._get_fitted_model() + return fitted.ppf(p) + + def get_params(self, deep=True): + """Return parameters of the distribution.""" + # Example: expose distfit_obj and its distribution type if available + distr_type = getattr(self.distfit_obj, "distr", None) + return {"distfit_obj": self.distfit_obj, "distr_type": distr_type}