From de8bd3ea4d21ab8b3db54019b44ce527ca8aa735 Mon Sep 17 00:00:00 2001 From: Khaoula Riad Date: Wed, 10 Jun 2026 11:31:03 +0200 Subject: [PATCH 1/9] add skpipeline to tabular pipeline --- skrub/_tabular_pipeline.py | 8 ++++++++ skrub/tests/test_tabular_pipeline.py | 24 ++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/skrub/_tabular_pipeline.py b/skrub/_tabular_pipeline.py index ac1c01244..922da2402 100644 --- a/skrub/_tabular_pipeline.py +++ b/skrub/_tabular_pipeline.py @@ -1,6 +1,7 @@ from sklearn import ensemble from sklearn.base import BaseEstimator from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline as skpipeline from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OrdinalEncoder @@ -48,6 +49,7 @@ def tabular_pipeline(estimator, *, n_jobs=None): Parameters ---------- estimator : {"regressor", "regression", "classifier", "classification"} or sklearn.base.BaseEstimator + or sklearn.pipeline.Pipeline The estimator to use as the final step in the pipeline. Based on the type of estimator, the previous preprocessing steps and their respective parameters are chosen. The possible values are: @@ -59,6 +61,7 @@ def tabular_pipeline(estimator, *, n_jobs=None): :obj:`~sklearn.ensemble.HistGradientBoostingClassifier` is used as the final step; - a scikit-learn estimator: the provided estimator is used as the final step. + - a scikit-learn pipeline : the last step of the pipeline is the estimator used as the final step. n_jobs : int, default=None Number of jobs to run in parallel in the :obj:`TableVectorizer` step. 
``None`` @@ -225,6 +228,11 @@ def tabular_pipeline(estimator, *, n_jobs=None): vectorizer = TableVectorizer(n_jobs=n_jobs) cat_feat_kwargs = {"categorical_features": "from_dtype"} + if isinstance(estimator, skpipeline): + return tabular_pipeline( + estimator.steps[-1][-1], + n_jobs=n_jobs, + ) if isinstance(estimator, str): if estimator in ("classifier", "classification"): return tabular_pipeline( diff --git a/skrub/tests/test_tabular_pipeline.py b/skrub/tests/test_tabular_pipeline.py index 42dd17504..9d8d6438f 100644 --- a/skrub/tests/test_tabular_pipeline.py +++ b/skrub/tests/test_tabular_pipeline.py @@ -1,7 +1,8 @@ import pytest from sklearn import ensemble from sklearn.impute import SimpleImputer -from sklearn.linear_model import Ridge +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.pipeline import Pipeline as skpipeline from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder from skrub import ( @@ -14,7 +15,13 @@ @pytest.mark.parametrize( - "learner_kind", ["regressor", "regression", "classifier", "classification"] + "learner_kind", + [ + "regressor", + "regression", + "classifier", + "classification", + ], ) def test_default_pipeline(learner_kind): p = tabular_pipeline(learner_kind) @@ -74,3 +81,16 @@ def test_from_dtype(): ensemble.HistGradientBoostingRegressor(categorical_features="from_dtype") ) assert isinstance(p.named_steps["tablevectorizer"].low_cardinality, ToCategorical) + + +def test_skpipeline_learner(): + original_learner = LogisticRegression() + sk_pipeline = skpipeline([("imputer", SimpleImputer()), ("clf", original_learner)]) + p = tabular_pipeline(sk_pipeline) + tv, imputer, scaler, learner = (e for _, e in p.steps) + assert learner is original_learner + assert isinstance(tv.high_cardinality, StringEncoder) + assert isinstance(tv.low_cardinality, OneHotEncoder) + assert isinstance(imputer, SimpleImputer) + assert isinstance(scaler, SquashingScaler) + assert tv.datetime.periodic_encoding == "spline" From 
2961a8aef7e2fa9469920c098a8458f467ddbad0 Mon Sep 17 00:00:00 2001 From: Khaoula Riad Date: Wed, 10 Jun 2026 14:14:57 +0200 Subject: [PATCH 2/9] feat : add usual parameter if skpipeline is passed --- skrub/_tabular_pipeline.py | 39 +++++++++++++++------------- skrub/tests/test_tabular_pipeline.py | 11 +++++--- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/skrub/_tabular_pipeline.py b/skrub/_tabular_pipeline.py index 922da2402..46c4ab6a0 100644 --- a/skrub/_tabular_pipeline.py +++ b/skrub/_tabular_pipeline.py @@ -1,8 +1,7 @@ from sklearn import ensemble from sklearn.base import BaseEstimator from sklearn.impute import SimpleImputer -from sklearn.pipeline import Pipeline as skpipeline -from sklearn.pipeline import make_pipeline +from sklearn.pipeline import Pipeline, make_pipeline from sklearn.preprocessing import OrdinalEncoder from ._datetime_encoder import DatetimeEncoder @@ -227,19 +226,18 @@ def tabular_pipeline(estimator, *, n_jobs=None): """ # noqa: E501 vectorizer = TableVectorizer(n_jobs=n_jobs) cat_feat_kwargs = {"categorical_features": "from_dtype"} + if isinstance(estimator, Pipeline): + estimator_ = estimator[-1] + else: + estimator_ = estimator - if isinstance(estimator, skpipeline): - return tabular_pipeline( - estimator.steps[-1][-1], - n_jobs=n_jobs, - ) - if isinstance(estimator, str): - if estimator in ("classifier", "classification"): + if isinstance(estimator_, str): + if estimator_ in ("classifier", "classification"): return tabular_pipeline( ensemble.HistGradientBoostingClassifier(**cat_feat_kwargs), n_jobs=n_jobs, ) - if estimator in ("regressor", "regression"): + if estimator_ in ("regressor", "regression"): return tabular_pipeline( ensemble.HistGradientBoostingRegressor(**cat_feat_kwargs), n_jobs=n_jobs, @@ -248,20 +246,20 @@ def tabular_pipeline(estimator, *, n_jobs=None): "If ``estimator`` is a string it should be 'regressor', 'regression'," " 'classifier' or 'classification'." 
) - if isinstance(estimator, type) and issubclass(estimator, BaseEstimator): + if isinstance(estimator_, type) and issubclass(estimator, BaseEstimator): raise TypeError( "tabular_pipeline expects a scikit-learn estimator as its first" - f" argument. Pass an instance of {estimator.__name__} rather than the class" - " itself." + f" argument. Pass an instance of {estimator_.__name__} rather than" + " the class itself." ) - if not isinstance(estimator, BaseEstimator): + if not isinstance(estimator_, BaseEstimator): raise TypeError( "tabular_pipeline expects a scikit-learn estimator, 'regressor'," " or 'classifier' as its first argument." ) if ( - isinstance(estimator, _HGBT_CLASSES) + isinstance(estimator_, _HGBT_CLASSES) and getattr(estimator, "categorical_features", None) == "from_dtype" ): vectorizer.set_params( @@ -278,10 +276,15 @@ def tabular_pipeline(estimator, *, n_jobs=None): ) else: vectorizer.set_params(datetime=DatetimeEncoder(periodic_encoding="spline")) + steps = [vectorizer] - if not get_tags(estimator).input_tags.allow_nan: + if not get_tags(estimator_).input_tags.allow_nan: steps.append(SimpleImputer(add_indicator=True)) - if not isinstance(estimator, _TREE_ENSEMBLE_CLASSES): + if not isinstance(estimator_, _TREE_ENSEMBLE_CLASSES): steps.append(SquashingScaler(max_absolute_value=5)) - steps.append(estimator) + if isinstance(estimator, Pipeline): + steps_pipeline = estimator.steps.copy() + steps.extend(steps_pipeline) + else: + steps.append(estimator_) return make_pipeline(*steps) diff --git a/skrub/tests/test_tabular_pipeline.py b/skrub/tests/test_tabular_pipeline.py index 9d8d6438f..1fc83d1ae 100644 --- a/skrub/tests/test_tabular_pipeline.py +++ b/skrub/tests/test_tabular_pipeline.py @@ -1,8 +1,9 @@ import pytest from sklearn import ensemble +from sklearn.decomposition import PCA from sklearn.impute import SimpleImputer from sklearn.linear_model import LogisticRegression, Ridge -from sklearn.pipeline import Pipeline as skpipeline +from 
sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder from skrub import ( @@ -85,12 +86,14 @@ def test_from_dtype(): def test_skpipeline_learner(): original_learner = LogisticRegression() - sk_pipeline = skpipeline([("imputer", SimpleImputer()), ("clf", original_learner)]) + sk_pipeline = Pipeline([("pca", PCA()), ("clf", original_learner)]) p = tabular_pipeline(sk_pipeline) - tv, imputer, scaler, learner = (e for _, e in p.steps) - assert learner is original_learner + assert len([e for _, e in p.steps]) == 5 + tv, imputer, scaler, pca, learner = (e for _, e in p.steps) + assert learner[-1] is original_learner assert isinstance(tv.high_cardinality, StringEncoder) assert isinstance(tv.low_cardinality, OneHotEncoder) assert isinstance(imputer, SimpleImputer) assert isinstance(scaler, SquashingScaler) + assert isinstance(pca[-1], PCA) assert tv.datetime.periodic_encoding == "spline" From e5406db4e0fb02584b5e4f6c7277053a8bee59a6 Mon Sep 17 00:00:00 2001 From: Khaoula Riad Date: Wed, 10 Jun 2026 14:27:48 +0200 Subject: [PATCH 3/9] add pull request number to changes.rst --- CHANGES.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index fce0bc259..60c94da53 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -54,6 +54,8 @@ Changes :pr:`2096` by :user:`Ayesha Siddiqua `. - The :class:`TableReport` can now be exported in markdown format with ``.markdown``. :pr:`2048` by :user:`Riccardo Cappuzzo `. +- Added usual parameters when sklearn pipeline is passed to tabular_pipeline + :pr:`7805` by :user:`Khaoula Riad and Marine Michaut`. 
Bugfixes -------- From 37fc38619fd3a92c9432299f66147389a9535bd8 Mon Sep 17 00:00:00 2001 From: Khaoula Riad Date: Wed, 10 Jun 2026 14:43:02 +0200 Subject: [PATCH 4/9] correct doc --- doc/modules/default_wrangling/tabular_pipeline.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/default_wrangling/tabular_pipeline.rst b/doc/modules/default_wrangling/tabular_pipeline.rst index d2b3067eb..b096d6202 100644 --- a/doc/modules/default_wrangling/tabular_pipeline.rst +++ b/doc/modules/default_wrangling/tabular_pipeline.rst @@ -134,12 +134,12 @@ the default table preprocessing: >>> from sklearn.decomposition import PCA >>> from sklearn.linear_model import Ridge ->>> from sklearn.pipeline import make_pipeline +>>> from sklearn.pipeline import Pipeline >>> from skrub import tabular_pipeline ->>> model_pipeline = make_pipeline(PCA(n_components=20), Ridge()) +>>> model_pipeline = Pipeline([("pca", PCA(n_components=20)), ("ridge", Ridge())]) >>> full_pipeline = tabular_pipeline(model_pipeline) >>> [name for name, _ in full_pipeline.steps] -['tablevectorizer', 'simpleimputer', 'squashingscaler', 'pipeline'] +['tablevectorizer', 'simpleimputer', 'squashingscaler', 'pca', 'ridge'] The user-provided estimator pipeline is appended as a single final step. 
This means that ``tabular_pipeline`` can still decide which preprocessing steps to From 26ad5e6dd232ebbf0c7868b102ed377df3711ca0 Mon Sep 17 00:00:00 2001 From: Khaoula Riad Date: Wed, 10 Jun 2026 14:59:39 +0200 Subject: [PATCH 5/9] proper PR number and comments taken into account --- CHANGES.rst | 5 +++-- skrub/_tabular_pipeline.py | 3 ++- skrub/tests/test_tabular_pipeline.py | 11 +++-------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 60c94da53..c177ac017 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -40,6 +40,9 @@ New Features Additionally, negative numbers indicated with parentheses can be converted to the regular numeric format (``(432)`` becomes ``-432``). :pr:`1772` by :user:`Gabriela Gómez Jiménez `. +- Added usual parameters when scikit-learn pipeline is passed to tabular_pipeline + :pr:`2152` by :user:`Khaoula Riad and Marine Michaut`. + Changes ------- @@ -54,8 +57,6 @@ Changes :pr:`2096` by :user:`Ayesha Siddiqua `. - The :class:`TableReport` can now be exported in markdown format with ``.markdown``. :pr:`2048` by :user:`Riccardo Cappuzzo `. -- Added usual parameters when sklearn pipeline is passed to tabular_pipeline - :pr:`7805` by :user:`Khaoula Riad and Marine Michaut`. Bugfixes -------- diff --git a/skrub/_tabular_pipeline.py b/skrub/_tabular_pipeline.py index 46c4ab6a0..2458d8c78 100644 --- a/skrub/_tabular_pipeline.py +++ b/skrub/_tabular_pipeline.py @@ -60,7 +60,8 @@ def tabular_pipeline(estimator, *, n_jobs=None): :obj:`~sklearn.ensemble.HistGradientBoostingClassifier` is used as the final step; - a scikit-learn estimator: the provided estimator is used as the final step. - - a scikit-learn pipeline : the last step of the pipeline is the estimator used as the final step. + - a scikit-learn pipeline : the whole pipeline is kept and usual parameters are added depending + on the estimator in the last step of the pipeline. 
n_jobs : int, default=None Number of jobs to run in parallel in the :obj:`TableVectorizer` step. ``None`` diff --git a/skrub/tests/test_tabular_pipeline.py b/skrub/tests/test_tabular_pipeline.py index 1fc83d1ae..64b8cf656 100644 --- a/skrub/tests/test_tabular_pipeline.py +++ b/skrub/tests/test_tabular_pipeline.py @@ -87,13 +87,8 @@ def test_from_dtype(): def test_skpipeline_learner(): original_learner = LogisticRegression() sk_pipeline = Pipeline([("pca", PCA()), ("clf", original_learner)]) - p = tabular_pipeline(sk_pipeline) - assert len([e for _, e in p.steps]) == 5 - tv, imputer, scaler, pca, learner = (e for _, e in p.steps) + tab_pipeline = tabular_pipeline(sk_pipeline) + assert len([e for _, e in tab_pipeline.steps]) == 5 + tv, imputer, scaler, pca, learner = (e for _, e in tab_pipeline.steps) assert learner[-1] is original_learner - assert isinstance(tv.high_cardinality, StringEncoder) - assert isinstance(tv.low_cardinality, OneHotEncoder) - assert isinstance(imputer, SimpleImputer) - assert isinstance(scaler, SquashingScaler) assert isinstance(pca[-1], PCA) - assert tv.datetime.periodic_encoding == "spline" From 30a9019303ad25ab709b8573508d4fb52ef8861b Mon Sep 17 00:00:00 2001 From: Khaoula Riad Date: Wed, 10 Jun 2026 15:14:18 +0200 Subject: [PATCH 6/9] correct accessing piepline --- doc/modules/default_wrangling/tabular_pipeline.rst | 4 ++-- skrub/_tabular_pipeline.py | 2 +- skrub/tests/test_tabular_pipeline.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/modules/default_wrangling/tabular_pipeline.rst b/doc/modules/default_wrangling/tabular_pipeline.rst index b096d6202..74045e4a2 100644 --- a/doc/modules/default_wrangling/tabular_pipeline.rst +++ b/doc/modules/default_wrangling/tabular_pipeline.rst @@ -134,9 +134,9 @@ the default table preprocessing: >>> from sklearn.decomposition import PCA >>> from sklearn.linear_model import Ridge ->>> from sklearn.pipeline import Pipeline +>>> from sklearn.pipeline import make_pipeline >>> 
from skrub import tabular_pipeline ->>> model_pipeline = Pipeline([("pca", PCA(n_components=20)), ("ridge", Ridge())]) +>>> model_pipeline = make_pipeline(PCA(n_components=20), Ridge()) >>> full_pipeline = tabular_pipeline(model_pipeline) >>> [name for name, _ in full_pipeline.steps] ['tablevectorizer', 'simpleimputer', 'squashingscaler', 'pca', 'ridge'] diff --git a/skrub/_tabular_pipeline.py b/skrub/_tabular_pipeline.py index 2458d8c78..f6a405fc2 100644 --- a/skrub/_tabular_pipeline.py +++ b/skrub/_tabular_pipeline.py @@ -284,7 +284,7 @@ def tabular_pipeline(estimator, *, n_jobs=None): if not isinstance(estimator_, _TREE_ENSEMBLE_CLASSES): steps.append(SquashingScaler(max_absolute_value=5)) if isinstance(estimator, Pipeline): - steps_pipeline = estimator.steps.copy() + steps_pipeline = [sp for _, sp in estimator.steps] steps.extend(steps_pipeline) else: steps.append(estimator_) diff --git a/skrub/tests/test_tabular_pipeline.py b/skrub/tests/test_tabular_pipeline.py index 64b8cf656..231d54421 100644 --- a/skrub/tests/test_tabular_pipeline.py +++ b/skrub/tests/test_tabular_pipeline.py @@ -90,5 +90,5 @@ def test_skpipeline_learner(): tab_pipeline = tabular_pipeline(sk_pipeline) assert len([e for _, e in tab_pipeline.steps]) == 5 tv, imputer, scaler, pca, learner = (e for _, e in tab_pipeline.steps) - assert learner[-1] is original_learner - assert isinstance(pca[-1], PCA) + assert learner is original_learner + assert isinstance(pca, PCA) From 0a78b9ffb4ded43298c97333efb8b5646d7b994c Mon Sep 17 00:00:00 2001 From: khaoulariad Date: Wed, 10 Jun 2026 15:51:29 +0200 Subject: [PATCH 7/9] Update skrub/_tabular_pipeline.py correcting doc Co-authored-by: Marie Sacksick <79304610+MarieSacksick@users.noreply.github.com> --- skrub/_tabular_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrub/_tabular_pipeline.py b/skrub/_tabular_pipeline.py index f6a405fc2..6cd69078b 100644 --- a/skrub/_tabular_pipeline.py +++ b/skrub/_tabular_pipeline.py @@ 
-60,8 +60,8 @@ def tabular_pipeline(estimator, *, n_jobs=None): :obj:`~sklearn.ensemble.HistGradientBoostingClassifier` is used as the final step; - a scikit-learn estimator: the provided estimator is used as the final step. - - a scikit-learn pipeline : the whole pipeline is kept and usual parameters are added depending - on the estimator in the last step of the pipeline. + - a scikit-learn pipeline : the whole pipeline is kept and usual pre-processing by the TableVectorizer + is added on top, depending on the estimator in the last step of the pipeline. n_jobs : int, default=None Number of jobs to run in parallel in the :obj:`TableVectorizer` step. ``None`` From 3f24e2b32bc9956b5e904608a3704f3eb78b4e3f Mon Sep 17 00:00:00 2001 From: khaoulariad Date: Wed, 10 Jun 2026 15:52:50 +0200 Subject: [PATCH 8/9] Update CHANGES.rst Co-authored-by: Marie Sacksick <79304610+MarieSacksick@users.noreply.github.com> --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index c177ac017..804373ea7 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -40,7 +40,7 @@ New Features Additionally, negative numbers indicated with parentheses can be converted to the regular numeric format (``(432)`` becomes ``-432``). :pr:`1772` by :user:`Gabriela Gómez Jiménez `. -- Added usual parameters when scikit-learn pipeline is passed to tabular_pipeline +- :func:`tabular_pipeline` now uses the final estimator of a given scikit-learn pipeline to determine the parameters of the TableVectorizer. :pr:`2152` by :user:`Khaoula Riad and Marine Michaut`. 
From 5dc4fa5b0e9f2bcf4bbfbaf00ff0449e8615512f Mon Sep 17 00:00:00 2001 From: khaoulariad Date: Wed, 10 Jun 2026 16:06:11 +0200 Subject: [PATCH 9/9] Update skrub/tests/test_tabular_pipeline.py Co-authored-by: Marie Sacksick <79304610+MarieSacksick@users.noreply.github.com> --- skrub/tests/test_tabular_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrub/tests/test_tabular_pipeline.py b/skrub/tests/test_tabular_pipeline.py index 231d54421..6731211c7 100644 --- a/skrub/tests/test_tabular_pipeline.py +++ b/skrub/tests/test_tabular_pipeline.py @@ -88,7 +88,7 @@ def test_skpipeline_learner(): original_learner = LogisticRegression() sk_pipeline = Pipeline([("pca", PCA()), ("clf", original_learner)]) tab_pipeline = tabular_pipeline(sk_pipeline) - assert len([e for _, e in tab_pipeline.steps]) == 5 - tv, imputer, scaler, pca, learner = (e for _, e in tab_pipeline.steps) + assert len([element for _, element in tab_pipeline.steps]) == 5 + tv, imputer, scaler, pca, learner = (element for _, element in tab_pipeline.steps) assert learner is original_learner assert isinstance(pca, PCA)