skrub-data · khaoulariad · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -40,6 +40,9 @@ New Features
   Additionally, negative numbers indicated with parentheses can be converted to the
   regular numeric format (``(432)`` becomes ``-432``). :pr:`1772` by :user:`Gabriela
   Gómez Jiménez <gabrielapgomezji>`.
+- ``tabular_pipeline`` accepts a scikit-learn ``Pipeline``: preprocessing is chosen
+  based on its final step. :pr:`2152` by :user:`Khaoula Riad and Marine Michaut`.
+
 
 Changes
 -------

diff --git a/doc/modules/default_wrangling/tabular_pipeline.rst b/doc/modules/default_wrangling/tabular_pipeline.rst
@@ -139,7 +139,7 @@ the default table preprocessing:
 >>> model_pipeline = make_pipeline(PCA(n_components=20), Ridge())
 >>> full_pipeline = tabular_pipeline(model_pipeline)
 >>> [name for name, _ in full_pipeline.steps]
-['tablevectorizer', 'simpleimputer', 'squashingscaler', 'pipeline']
+['tablevectorizer', 'simpleimputer', 'squashingscaler', 'pca', 'ridge']
 
-The user-provided estimator pipeline is appended as a single final step. This
+The user-provided pipeline's steps are appended after the preprocessing steps. This
 means that ``tabular_pipeline`` can still decide which preprocessing steps to

diff --git a/skrub/_tabular_pipeline.py b/skrub/_tabular_pipeline.py
@@ -1,7 +1,7 @@
 from sklearn import ensemble
 from sklearn.base import BaseEstimator
 from sklearn.impute import SimpleImputer
-from sklearn.pipeline import make_pipeline
+from sklearn.pipeline import Pipeline, make_pipeline
 from sklearn.preprocessing import OrdinalEncoder
 
 from ._datetime_encoder import DatetimeEncoder
@@ -48,6 +48,7 @@ def tabular_pipeline(estimator, *, n_jobs=None):
     Parameters
     ----------
     estimator : {"regressor", "regression", "classifier", "classification"} or sklearn.base.BaseEstimator
+        or sklearn.pipeline.Pipeline
         The estimator to use as the final step in the pipeline. Based on the type of
         estimator, the previous preprocessing steps and their respective parameters are
         chosen. The possible values are:
@@ -59,6 +60,8 @@ def tabular_pipeline(estimator, *, n_jobs=None):
           :obj:`~sklearn.ensemble.HistGradientBoostingClassifier` is used as the final
           step;
         - a scikit-learn estimator: the provided estimator is used as the final step.
+        - a scikit-learn pipeline: preprocessing is chosen from its final step, and the
+          pipeline's own steps are appended after the preprocessing steps.
 
     n_jobs : int, default=None
         Number of jobs to run in parallel in the :obj:`TableVectorizer` step. ``None``
@@ -224,14 +227,18 @@ def tabular_pipeline(estimator, *, n_jobs=None):
     """  # noqa: E501
     vectorizer = TableVectorizer(n_jobs=n_jobs)
     cat_feat_kwargs = {"categorical_features": "from_dtype"}
+    if isinstance(estimator, Pipeline):
+        estimator_ = estimator[-1]
+    else:
+        estimator_ = estimator
 
-    if isinstance(estimator, str):
-        if estimator in ("classifier", "classification"):
+    if isinstance(estimator_, str):
+        if estimator_ in ("classifier", "classification"):
             return tabular_pipeline(
                 ensemble.HistGradientBoostingClassifier(**cat_feat_kwargs),
                 n_jobs=n_jobs,
             )
-        if estimator in ("regressor", "regression"):
+        if estimator_ in ("regressor", "regression"):
             return tabular_pipeline(
                 ensemble.HistGradientBoostingRegressor(**cat_feat_kwargs),
                 n_jobs=n_jobs,
@@ -240,20 +247,20 @@ def tabular_pipeline(estimator, *, n_jobs=None):
             "If ``estimator`` is a string it should be 'regressor', 'regression',"
             " 'classifier' or 'classification'."
         )
-    if isinstance(estimator, type) and issubclass(estimator, BaseEstimator):
+    if isinstance(estimator_, type) and issubclass(estimator, BaseEstimator):
         raise TypeError(
             "tabular_pipeline expects a scikit-learn estimator as its first"
-            f" argument. Pass an instance of {estimator.__name__} rather than the class"
-            " itself."
+            f" argument. Pass an instance of {estimator_.__name__} rather than"
+            " the class itself."
         )
-    if not isinstance(estimator, BaseEstimator):
+    if not isinstance(estimator_, BaseEstimator):
         raise TypeError(
             "tabular_pipeline expects a scikit-learn estimator, 'regressor',"
             " or 'classifier' as its first argument."
         )
 
     if (
-        isinstance(estimator, _HGBT_CLASSES)
+        isinstance(estimator_, _HGBT_CLASSES)
         and getattr(estimator, "categorical_features", None) == "from_dtype"
     ):
         vectorizer.set_params(
@@ -270,10 +277,15 @@ def tabular_pipeline(estimator, *, n_jobs=None):
         )
     else:
         vectorizer.set_params(datetime=DatetimeEncoder(periodic_encoding="spline"))
+
     steps = [vectorizer]
-    if not get_tags(estimator).input_tags.allow_nan:
+    if not get_tags(estimator_).input_tags.allow_nan:
         steps.append(SimpleImputer(add_indicator=True))
-    if not isinstance(estimator, _TREE_ENSEMBLE_CLASSES):
+    if not isinstance(estimator_, _TREE_ENSEMBLE_CLASSES):
         steps.append(SquashingScaler(max_absolute_value=5))
-    steps.append(estimator)
+    if isinstance(estimator, Pipeline):
+        steps_pipeline = [sp for _, sp in estimator.steps]
+        steps.extend(steps_pipeline)
+    else:
+        steps.append(estimator_)
     return make_pipeline(*steps)
diff --git a/skrub/tests/test_tabular_pipeline.py b/skrub/tests/test_tabular_pipeline.py
@@ -1,7 +1,9 @@
 import pytest
 from sklearn import ensemble
+from sklearn.decomposition import PCA
 from sklearn.impute import SimpleImputer
-from sklearn.linear_model import Ridge
+from sklearn.linear_model import LogisticRegression, Ridge
+from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
 
 from skrub import (
@@ -14,7 +16,13 @@
 
 
 @pytest.mark.parametrize(
-    "learner_kind", ["regressor", "regression", "classifier", "classification"]
+    "learner_kind",
+    [
+        "regressor",
+        "regression",
+        "classifier",
+        "classification",
+    ],
 )
 def test_default_pipeline(learner_kind):
     p = tabular_pipeline(learner_kind)
@@ -74,3 +82,13 @@ def test_from_dtype():
         ensemble.HistGradientBoostingRegressor(categorical_features="from_dtype")
     )
     assert isinstance(p.named_steps["tablevectorizer"].low_cardinality, ToCategorical)
+
+
+def test_estimator_is_a_pipeline():
+    original_learner = LogisticRegression()
+    sk_pipeline = Pipeline([("pca", PCA()), ("clf", original_learner)])
+    tab_pipeline = tabular_pipeline(sk_pipeline)
+    assert len([e for _, e in tab_pipeline.steps]) == 5
+    tv, imputer, scaler, pca, learner = (e for _, e in tab_pipeline.steps)
+    assert learner is original_learner
+    assert isinstance(pca, PCA)