diff --git a/CHANGES.rst b/CHANGES.rst index fce0bc259..804373ea7 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -40,6 +40,9 @@ New Features Additionally, negative numbers indicated with parentheses can be converted to the regular numeric format (``(432)`` becomes ``-432``). :pr:`1772` by :user:`Gabriela Gómez Jiménez `. +- :func:`tabular_pipeline`, when given a scikit-learn pipeline, now uses the final estimator of that pipeline to choose the preprocessing steps and the parameters of the :class:`TableVectorizer`. + :pr:`2152` by :user:`Khaoula Riad` and :user:`Marine Michaut`. + Changes ------- diff --git a/doc/modules/default_wrangling/tabular_pipeline.rst b/doc/modules/default_wrangling/tabular_pipeline.rst index d2b3067eb..74045e4a2 100644 --- a/doc/modules/default_wrangling/tabular_pipeline.rst +++ b/doc/modules/default_wrangling/tabular_pipeline.rst @@ -139,7 +139,7 @@ the default table preprocessing: >>> model_pipeline = make_pipeline(PCA(n_components=20), Ridge()) >>> full_pipeline = tabular_pipeline(model_pipeline) >>> [name for name, _ in full_pipeline.steps] -['tablevectorizer', 'simpleimputer', 'squashingscaler', 'pipeline'] +['tablevectorizer', 'simpleimputer', 'squashingscaler', 'pca', 'ridge'] -The user-provided estimator pipeline is appended as a single final step. +The steps of the user-provided pipeline are appended after the preprocessing steps. 
This means that ``tabular_pipeline`` can still decide which preprocessing steps to diff --git a/skrub/_tabular_pipeline.py b/skrub/_tabular_pipeline.py index ac1c01244..6cd69078b 100644 --- a/skrub/_tabular_pipeline.py +++ b/skrub/_tabular_pipeline.py @@ -1,7 +1,7 @@ from sklearn import ensemble from sklearn.base import BaseEstimator from sklearn.impute import SimpleImputer -from sklearn.pipeline import make_pipeline +from sklearn.pipeline import Pipeline, make_pipeline from sklearn.preprocessing import OrdinalEncoder from ._datetime_encoder import DatetimeEncoder @@ -48,6 +48,7 @@ def tabular_pipeline(estimator, *, n_jobs=None): Parameters ---------- estimator : {"regressor", "regression", "classifier", "classification"} or sklearn.base.BaseEstimator + or sklearn.pipeline.Pipeline The estimator to use as the final step in the pipeline. Based on the type of estimator, the previous preprocessing steps and their respective parameters are chosen. The possible values are: @@ -59,6 +60,8 @@ def tabular_pipeline(estimator, *, n_jobs=None): :obj:`~sklearn.ensemble.HistGradientBoostingClassifier` is used as the final step; - a scikit-learn estimator: the provided estimator is used as the final step. + - a scikit-learn pipeline: the whole pipeline is kept and the usual preprocessing steps (the :obj:`TableVectorizer`, plus imputation and scaling when needed) + are added before it, depending on the estimator in the last step of the pipeline. n_jobs : int, default=None Number of jobs to run in parallel in the :obj:`TableVectorizer` step. 
``None`` @@ -224,14 +227,18 @@ def tabular_pipeline(estimator, *, n_jobs=None): """ # noqa: E501 vectorizer = TableVectorizer(n_jobs=n_jobs) cat_feat_kwargs = {"categorical_features": "from_dtype"} + if isinstance(estimator, Pipeline): + estimator_ = estimator[-1] + else: + estimator_ = estimator - if isinstance(estimator, str): - if estimator in ("classifier", "classification"): + if isinstance(estimator_, str): + if estimator_ in ("classifier", "classification"): return tabular_pipeline( ensemble.HistGradientBoostingClassifier(**cat_feat_kwargs), n_jobs=n_jobs, ) - if estimator in ("regressor", "regression"): + if estimator_ in ("regressor", "regression"): return tabular_pipeline( ensemble.HistGradientBoostingRegressor(**cat_feat_kwargs), n_jobs=n_jobs, @@ -240,20 +247,20 @@ def tabular_pipeline(estimator, *, n_jobs=None): "If ``estimator`` is a string it should be 'regressor', 'regression'," " 'classifier' or 'classification'." ) - if isinstance(estimator, type) and issubclass(estimator, BaseEstimator): + if isinstance(estimator_, type) and issubclass(estimator_, BaseEstimator): raise TypeError( "tabular_pipeline expects a scikit-learn estimator as its first" - f" argument. Pass an instance of {estimator.__name__} rather than the class" - " itself." + f" argument. Pass an instance of {estimator_.__name__} rather than" + " the class itself." ) - if not isinstance(estimator, BaseEstimator): + if not isinstance(estimator_, BaseEstimator): raise TypeError( "tabular_pipeline expects a scikit-learn estimator, 'regressor'," " or 'classifier' as its first argument." 
) if ( - isinstance(estimator, _HGBT_CLASSES) + isinstance(estimator_, _HGBT_CLASSES) - and getattr(estimator, "categorical_features", None) == "from_dtype" + and getattr(estimator_, "categorical_features", None) == "from_dtype" ): vectorizer.set_params( @@ -270,10 +277,15 @@ def tabular_pipeline(estimator, *, n_jobs=None): ) else: vectorizer.set_params(datetime=DatetimeEncoder(periodic_encoding="spline")) + steps = [vectorizer] - if not get_tags(estimator).input_tags.allow_nan: + if not get_tags(estimator_).input_tags.allow_nan: steps.append(SimpleImputer(add_indicator=True)) - if not isinstance(estimator, _TREE_ENSEMBLE_CLASSES): + if not isinstance(estimator_, _TREE_ENSEMBLE_CLASSES): steps.append(SquashingScaler(max_absolute_value=5)) - steps.append(estimator) + if isinstance(estimator, Pipeline): + steps_pipeline = [sp for _, sp in estimator.steps] + steps.extend(steps_pipeline) + else: + steps.append(estimator_) return make_pipeline(*steps) diff --git a/skrub/tests/test_tabular_pipeline.py b/skrub/tests/test_tabular_pipeline.py index 42dd17504..6731211c7 100644 --- a/skrub/tests/test_tabular_pipeline.py +++ b/skrub/tests/test_tabular_pipeline.py @@ -1,7 +1,9 @@ import pytest from sklearn import ensemble +from sklearn.decomposition import PCA from sklearn.impute import SimpleImputer -from sklearn.linear_model import Ridge +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder from skrub import ( @@ -14,7 +16,13 @@ @pytest.mark.parametrize( - "learner_kind", ["regressor", "regression", "classifier", "classification"] + "learner_kind", + [ + "regressor", + "regression", + "classifier", + "classification", + ], ) def test_default_pipeline(learner_kind): p = tabular_pipeline(learner_kind) @@ -74,3 +82,13 @@ def test_from_dtype(): ensemble.HistGradientBoostingRegressor(categorical_features="from_dtype") ) assert isinstance(p.named_steps["tablevectorizer"].low_cardinality, ToCategorical) + + +def test_skpipeline_learner(): + 
original_learner = LogisticRegression() + sk_pipeline = Pipeline([("pca", PCA()), ("clf", original_learner)]) + tab_pipeline = tabular_pipeline(sk_pipeline) + assert len(tab_pipeline.steps) == 5 + tv, imputer, scaler, pca, learner = (element for _, element in tab_pipeline.steps) + assert learner is original_learner + assert isinstance(pca, PCA)