diff --git a/docs/source/api_reference/regression.rst b/docs/source/api_reference/regression.rst index d312b46a1..c0aa98bf5 100644 --- a/docs/source/api_reference/regression.rst +++ b/docs/source/api_reference/regression.rst @@ -24,6 +24,8 @@ Composition Pipeline TransformedTargetRegressor + TargetTransform + DistrPredictiveCalibration Model selection and tuning -------------------------- diff --git a/skpro/distributions/tests/test_proba_basic.py b/skpro/distributions/tests/test_proba_basic.py index 3cb1bf26d..60341d7a8 100644 --- a/skpro/distributions/tests/test_proba_basic.py +++ b/skpro/distributions/tests/test_proba_basic.py @@ -112,8 +112,13 @@ def test_proba_index_coercion(): @pytest.mark.parametrize("fun", ["pdf", "ppf", "cdf"]) def test_proba_plotting(fun): """Test that plotting functions do not crash and return ax as expected.""" + import matplotlib + + matplotlib.use("Agg", force=True) + from matplotlib.axes import Axes from matplotlib.figure import Figure + from matplotlib.pyplot import close from skpro.distributions.normal import Normal @@ -125,6 +130,7 @@ def test_proba_plotting(fun): assert ax.shape == n.shape assert all([isinstance(a, Axes) for a in ax.flatten()]) assert all([a.get_figure() == fig for a in ax.flatten()]) + close(fig) # 1D case requires special treatment of axes n = Normal(mu=[[1], [2], [3]], sigma=1) @@ -134,11 +140,13 @@ def test_proba_plotting(fun): assert ax.shape == (n.shape[0],) assert all([isinstance(a, Axes) for a in ax.flatten()]) assert all([a.get_figure() == fig for a in ax.flatten()]) + close(fig) # scalar case n = Normal(mu=1, sigma=1) ax = n.plot(fun=fun) assert isinstance(ax, Axes) + close(ax.figure) @pytest.mark.skip(reason="Undiagnosed failure. Skipping until resolved. See #918.") diff --git a/skpro/regression/compose/__init__.py b/skpro/regression/compose/__init__.py index ae972a3bb..07b014893 100644 --- a/skpro/regression/compose/__init__.py +++ b/skpro/regression/compose/__init__.py @@ -2,8 +2,14 @@ from skpro.regression.compose._pipeline import Pipeline from skpro.regression.compose._ttr import TransformedTargetRegressor +from skpro.regression.compose.distr_predictive_calibration import ( + DistrPredictiveCalibration, +) +from skpro.regression.compose.target_transform import TargetTransform __all__ = [ "Pipeline", "TransformedTargetRegressor", + "TargetTransform", + "DistrPredictiveCalibration", ] diff --git a/skpro/regression/compose/distr_predictive_calibration.py b/skpro/regression/compose/distr_predictive_calibration.py new file mode 100644 index 000000000..964e680f2 --- /dev/null +++ b/skpro/regression/compose/distr_predictive_calibration.py @@ -0,0 +1,134 @@ +"""Implements predictive target calibration for probabilistic regression.""" +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) + +__author__ = ["arnavk23"] +__all__ = ["DistrPredictiveCalibration"] + +from sklearn.base import BaseEstimator + +from skpro.regression.base import BaseProbaRegressor + + +class _IdentityCalibrator(BaseEstimator): + """Identity calibrator used in estimator checks for test instance creation. + + Kept at module scope so sklearn cloning and serialization remain robust in tests. + """ + + def fit(self, y_true, y_pred): + return self + + def transform(self, y_pred): + return y_pred + + +class _ScaleOnlyCalibrator(BaseEstimator): + """Simple deterministic calibrator for estimator check parametrization.""" + + def __init__(self, scale=1.1): + self.scale = scale + + def fit(self, y_true, y_pred): + return self + + def transform(self, y_pred): + return y_pred + + +class DistrPredictiveCalibration(BaseProbaRegressor): + """DistrPredictiveCalibration pipeline for predictive target calibration. + + Wraps a probabilistic regressor and applies a calibration method + to its predicted distributions. + + Parameters + ---------- + regressor : BaseProbaRegressor + The probabilistic regressor to wrap. + calibrator : object + The calibration method to apply to predicted distributions. + Must implement fit(y_true, y_pred) and transform(y_pred). + + Examples + -------- + >>> from skpro.regression.compose import DistrPredictiveCalibration + >>> from skpro.regression.residual import ResidualDouble + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.model_selection import train_test_split + >>> import pandas as pd + >>> # Dummy calibrator for demonstration + >>> from sklearn.base import BaseEstimator, TransformerMixin + >>> class DummyCalibrator(BaseEstimator, TransformerMixin): + ... def fit(self, y_true, y_pred): + ... return self + ... def transform(self, y_pred): + ... return y_pred + >>> # Load data + >>> X, y = load_diabetes(return_X_y=True, as_frame=True) + >>> y = pd.DataFrame(y) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y) + >>> reg = ResidualDouble.create_test_instance() + >>> cal = DummyCalibrator() + >>> calreg = DistrPredictiveCalibration(regressor=reg, calibrator=cal) + >>> calreg.fit(X_train, y_train) + DistrPredictiveCalibration(...) + >>> y_pred = calreg.predict(X_test) + >>> y_pred_proba = calreg.predict_proba(X_test) + >>> # Note: Calibrator must accept and return distribution objects + >>> # as output from predict_proba. + """ + + _tags = { + "capability:multioutput": True, + "capability:missing": True, + } + + def __init__(self, regressor, calibrator): + self.regressor = regressor + self.calibrator = calibrator + super().__init__() + + def _fit(self, X, y, C=None): + from sklearn.base import clone + + # Clone regressor and calibrator to avoid mutating input parameters + self._fitted_regressor = clone(self.regressor) + self._fitted_regressor.fit(X, y, C=C) + self._fitted_calibrator = clone(self.calibrator) + # Fit calibrator on training predictions + y_pred = self._fitted_regressor.predict_proba(X) + self._fitted_calibrator.fit(y, y_pred) + return self + + def _predict(self, X): + return self._fitted_regressor.predict(X) + + def _predict_quantiles(self, X, alpha): + y_pred = self._fitted_regressor.predict_quantiles(X, alpha) + return self._fitted_calibrator.transform(y_pred) + + def _predict_interval(self, X, coverage): + y_pred = self._fitted_regressor.predict_interval(X, coverage) + return self._fitted_calibrator.transform(y_pred) + + def _predict_var(self, X): + y_pred = self._fitted_regressor.predict_var(X) + return self._fitted_calibrator.transform(y_pred) + + def _predict_proba(self, X): + y_pred = self._fitted_regressor.predict_proba(X) + return self._fitted_calibrator.transform(y_pred) + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter sets for automated tests. + + Uses explicit calibrators to exercise constructor and set/get param checks. + """ + from skpro.regression.residual import ResidualDouble + + reg = ResidualDouble.create_test_instance() + return [ + {"regressor": reg, "calibrator": _IdentityCalibrator()}, + {"regressor": reg, "calibrator": _ScaleOnlyCalibrator(scale=1.05)}, + ] diff --git a/skpro/regression/compose/target_transform.py b/skpro/regression/compose/target_transform.py new file mode 100644 index 000000000..e122517ae --- /dev/null +++ b/skpro/regression/compose/target_transform.py @@ -0,0 +1,98 @@ +"""Implements target transformation pipeline element for probabilistic regression.""" +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) + +__author__ = ["arnavk23"] +__all__ = ["TargetTransform"] + +from skpro.regression.base import BaseProbaRegressor +from skpro.regression.compose._ttr import TransformedTargetRegressor + + +class TargetTransform(BaseProbaRegressor): + """TargetTransform pipeline for target variable transformation. + + Wraps a regressor and a transformer, applying the transformer to y + during fit and inverse-transforming predictions. + Uses TransformedTargetRegressor internally. + + Parameters + ---------- + regressor : BaseProbaRegressor + The probabilistic regressor to wrap. + transformer : sklearn-like transformer + The transformer to apply to the target variable. + + Examples + -------- + >>> from skpro.regression.compose import TargetTransform + >>> from skpro.regression.residual import ResidualDouble + >>> from sklearn.preprocessing import StandardScaler, MinMaxScaler + >>> from sklearn.datasets import load_diabetes + >>> from sklearn.model_selection import train_test_split + >>> import pandas as pd + >>> # Load data + >>> X, y = load_diabetes(return_X_y=True, as_frame=True) + >>> y = pd.DataFrame(y) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y) + >>> # Create a probabilistic regressor + >>> reg = ResidualDouble.create_test_instance() + >>> # Use StandardScaler for target transformation + >>> ttr = TargetTransform(regressor=reg, transformer=StandardScaler()) + >>> ttr.fit(X_train, y_train) + TargetTransform(...) + >>> y_pred = ttr.predict(X_test) + >>> y_pred_proba = ttr.predict_proba(X_test) + >>> # Use MinMaxScaler for target transformation + >>> ttr2 = TargetTransform(regressor=reg, transformer=MinMaxScaler()) + >>> ttr2.fit(X_train, y_train) + TargetTransform(...) + >>> y_pred2 = ttr2.predict(X_test) + """ + + _tags = { + "capability:multioutput": True, + "capability:missing": True, + } + + def __init__(self, regressor, transformer): + self.regressor = regressor + self.transformer = transformer + self._ttr = TransformedTargetRegressor( + regressor=regressor, transformer=transformer + ) + super().__init__() + + def _fit(self, X, y, C=None): + self._ttr.fit(X, y, C=C) + return self + + def _predict(self, X): + return self._ttr.predict(X) + + def _predict_quantiles(self, X, alpha): + return self._ttr.predict_quantiles(X, alpha) + + def _predict_interval(self, X, coverage): + return self._ttr.predict_interval(X, coverage) + + def _predict_var(self, X): + return self._ttr.predict_var(X) + + def _predict_proba(self, X): + return self._ttr.predict_proba(X) + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter sets for automated tests. + + Returns two parameter sets: one with StandardScaler, one with MinMaxScaler. + """ + from sklearn.preprocessing import MinMaxScaler, StandardScaler + + from skpro.regression.residual import ResidualDouble + + reg = ResidualDouble.create_test_instance() + return [ + {"regressor": reg, "transformer": StandardScaler()}, + {"regressor": reg, "transformer": MinMaxScaler()}, + ] diff --git a/skpro/regression/tests/test_target_transform_calibration.py b/skpro/regression/tests/test_target_transform_calibration.py new file mode 100644 index 000000000..cc8293192 --- /dev/null +++ b/skpro/regression/tests/test_target_transform_calibration.py @@ -0,0 +1,124 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.base import BaseEstimator, clone +from sklearn.datasets import load_diabetes +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler + +from skpro.distributions.normal import Normal +from skpro.regression.compose import DistrPredictiveCalibration, TargetTransform +from skpro.regression.residual import ResidualDouble + + +class _ShiftScaleCalibrator(BaseEstimator): + """Calibrator that shifts location and scales spread for test assertions.""" + + def __init__(self, offset=5.0, spread_mult=1.25): + self.offset = offset + self.spread_mult = spread_mult + + def fit(self, y_true, y_pred): + self.shift_ = float(self.offset) + return self + + def transform(self, y_pred): + if hasattr(y_pred, "mu") and hasattr(y_pred, "sigma"): + mu = np.asarray(y_pred.mu) + self.shift_ + sigma = np.asarray(y_pred.sigma) * self.spread_mult + return Normal( + mu=mu, sigma=sigma, index=y_pred.index, columns=y_pred.columns + ) + if isinstance(y_pred, pd.DataFrame): + return y_pred * self.spread_mult + self.shift_ + return y_pred + + +@pytest.fixture +def diabetes_split(): + X, y = load_diabetes(return_X_y=True, as_frame=True) + y = pd.DataFrame(y) + return train_test_split(X, y, test_size=0.3, random_state=42) + + +def _qnorm_84(): + return 0.8413447460685429 + + +def test_target_transform_standard_scaler_inverse_transforms_location_and_scale( + diabetes_split, +): + X_train, X_test, y_train, _ = diabetes_split + + base_reg = ResidualDouble(estimator=LinearRegression()) + scaler = StandardScaler() + + wrapped = TargetTransform(regressor=clone(base_reg), transformer=StandardScaler()) + wrapped.fit(X_train, y_train) + + y_train_t = pd.DataFrame( + scaler.fit_transform(y_train), index=y_train.index, columns=y_train.columns + ) + manual_reg = clone(base_reg) + manual_reg.fit(X_train, y_train_t) + + wrapped_dist = wrapped.predict_proba(X_test) + manual_dist_t = manual_reg.predict_proba(X_test) + + scale = float(scaler.scale_[0]) + mean = float(scaler.mean_[0]) + + expected_mu = np.asarray(manual_dist_t.mu) * scale + mean + expected_sigma = np.asarray(manual_dist_t.sigma) * scale + + wrapped_median = wrapped_dist.ppf(0.5).to_numpy() + wrapped_q84 = wrapped_dist.ppf(_qnorm_84()).to_numpy() + wrapped_sigma = wrapped_q84 - wrapped_median + + assert np.allclose(wrapped_median, expected_mu, rtol=1e-6, atol=1e-6) + assert np.allclose(wrapped_sigma, expected_sigma, rtol=1e-6, atol=1e-6) + + +def test_distr_predictive_calibration_modifies_predicted_distribution(diabetes_split): + X_train, X_test, y_train, _ = diabetes_split + + base_reg = ResidualDouble(estimator=LinearRegression()) + base_reg.fit(X_train, y_train) + + calibrated = DistrPredictiveCalibration( + regressor=ResidualDouble(estimator=LinearRegression()), + calibrator=_ShiftScaleCalibrator(spread_mult=1.4), + ) + calibrated.fit(X_train, y_train) + + before = base_reg.predict_proba(X_test) + after = calibrated.predict_proba(X_test) + + before_median = before.ppf(0.5).to_numpy() + after_median = after.ppf(0.5).to_numpy() + + before_spread = before.ppf(_qnorm_84()).to_numpy() - before_median + after_spread = after.ppf(_qnorm_84()).to_numpy() - after_median + + assert not np.allclose(after_median, before_median) + assert not np.allclose(after_spread, before_spread) + + +def test_distr_predictive_calibration_modifies_quantile_predictions(diabetes_split): + X_train, X_test, y_train, _ = diabetes_split + + base_reg = ResidualDouble(estimator=LinearRegression()) + base_reg.fit(X_train, y_train) + + calibrated = DistrPredictiveCalibration( + regressor=ResidualDouble(estimator=LinearRegression()), + calibrator=_ShiftScaleCalibrator(spread_mult=1.3), + ) + calibrated.fit(X_train, y_train) + + alpha = [0.1, 0.5, 0.9] + before_q = base_reg.predict_quantiles(X_test, alpha=alpha) + after_q = calibrated.predict_quantiles(X_test, alpha=alpha) + + assert not np.allclose(after_q.to_numpy(), before_q.to_numpy())