Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ New Features
Additionally, negative numbers indicated with parentheses can be converted to the
regular numeric format (``(432)`` becomes ``-432``). :pr:`1772` by :user:`Gabriela
Gómez Jiménez <gabrielapgomezji>`.
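- :func:`tabular_pipeline` now accepts a scikit-learn pipeline as ``estimator``: the default preprocessing steps are chosen based on the pipeline's final estimator, and the pipeline's own steps are appended after them.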
Comment thread
khaoulariad marked this conversation as resolved.
Outdated
:pr:`2152` by :user:`Khaoula Riad and Marine Michaut`.


Changes
-------
Expand Down
2 changes: 1 addition & 1 deletion doc/modules/default_wrangling/tabular_pipeline.rst
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ the default table preprocessing:
>>> model_pipeline = make_pipeline(PCA(n_components=20), Ridge())
>>> full_pipeline = tabular_pipeline(model_pipeline)
>>> [name for name, _ in full_pipeline.steps]
['tablevectorizer', 'simpleimputer', 'squashingscaler', 'pipeline']
['tablevectorizer', 'simpleimputer', 'squashingscaler', 'pca', 'ridge']

The steps of the user-provided estimator pipeline are appended, in order, after the default preprocessing steps. This
means that ``tabular_pipeline`` can still decide which preprocessing steps to
Expand Down
36 changes: 24 additions & 12 deletions skrub/_tabular_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from sklearn import ensemble
from sklearn.base import BaseEstimator
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder

from ._datetime_encoder import DatetimeEncoder
Expand Down Expand Up @@ -48,6 +48,7 @@ def tabular_pipeline(estimator, *, n_jobs=None):
Parameters
----------
estimator : {"regressor", "regression", "classifier", "classification"} or sklearn.base.BaseEstimator
or sklearn.pipeline.Pipeline
The estimator to use as the final step in the pipeline. Based on the type of
estimator, the previous preprocessing steps and their respective parameters are
chosen. The possible values are:
Expand All @@ -59,6 +60,8 @@ def tabular_pipeline(estimator, *, n_jobs=None):
:obj:`~sklearn.ensemble.HistGradientBoostingClassifier` is used as the final
step;
- a scikit-learn estimator: the provided estimator is used as the final step.
- a scikit-learn pipeline: the steps of the pipeline are kept and appended after the default
preprocessing steps, which are chosen based on the estimator in the last step of the pipeline.
Comment thread
khaoulariad marked this conversation as resolved.
Outdated

n_jobs : int, default=None
Number of jobs to run in parallel in the :obj:`TableVectorizer` step. ``None``
Expand Down Expand Up @@ -224,14 +227,18 @@ def tabular_pipeline(estimator, *, n_jobs=None):
""" # noqa: E501
vectorizer = TableVectorizer(n_jobs=n_jobs)
cat_feat_kwargs = {"categorical_features": "from_dtype"}
if isinstance(estimator, Pipeline):

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should come after the checks below -- it is unlikely someone will put the string "regressor" or a class at the end of a pipeline before passing it to tabular_pipeline

also please avoid local variable names that differ by a single character like estimator and estimator_. here we can have something like

if isinstance(estimator, Pipeline):
    *user_transformers, estimator = estimator.steps
else:
    user_transformers = ()
...
make_pipeline(TableVectorizer(), *user_transformers, estimator)

estimator_ = estimator[-1]
else:
estimator_ = estimator

if isinstance(estimator, str):
if estimator in ("classifier", "classification"):
if isinstance(estimator_, str):
if estimator_ in ("classifier", "classification"):
return tabular_pipeline(
ensemble.HistGradientBoostingClassifier(**cat_feat_kwargs),
n_jobs=n_jobs,
)
if estimator in ("regressor", "regression"):
if estimator_ in ("regressor", "regression"):
return tabular_pipeline(
ensemble.HistGradientBoostingRegressor(**cat_feat_kwargs),
n_jobs=n_jobs,
Expand All @@ -240,20 +247,20 @@ def tabular_pipeline(estimator, *, n_jobs=None):
"If ``estimator`` is a string it should be 'regressor', 'regression',"
" 'classifier' or 'classification'."
)
if isinstance(estimator, type) and issubclass(estimator, BaseEstimator):
if isinstance(estimator_, type) and issubclass(estimator, BaseEstimator):
raise TypeError(
"tabular_pipeline expects a scikit-learn estimator as its first"
f" argument. Pass an instance of {estimator.__name__} rather than the class"
" itself."
f" argument. Pass an instance of {estimator_.__name__} rather than"
" the class itself."
)
if not isinstance(estimator, BaseEstimator):
if not isinstance(estimator_, BaseEstimator):
raise TypeError(
"tabular_pipeline expects a scikit-learn estimator, 'regressor',"
" or 'classifier' as its first argument."
)

if (
isinstance(estimator, _HGBT_CLASSES)
isinstance(estimator_, _HGBT_CLASSES)
and getattr(estimator, "categorical_features", None) == "from_dtype"
):
vectorizer.set_params(
Expand All @@ -270,10 +277,15 @@ def tabular_pipeline(estimator, *, n_jobs=None):
)
else:
vectorizer.set_params(datetime=DatetimeEncoder(periodic_encoding="spline"))

steps = [vectorizer]
if not get_tags(estimator).input_tags.allow_nan:
if not get_tags(estimator_).input_tags.allow_nan:
steps.append(SimpleImputer(add_indicator=True))
if not isinstance(estimator, _TREE_ENSEMBLE_CLASSES):
if not isinstance(estimator_, _TREE_ENSEMBLE_CLASSES):
steps.append(SquashingScaler(max_absolute_value=5))
steps.append(estimator)
if isinstance(estimator, Pipeline):

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

with the suggestion above we can do all the handling of pipelines in one place

steps_pipeline = [sp for _, sp in estimator.steps]
steps.extend(steps_pipeline)
else:
steps.append(estimator_)
return make_pipeline(*steps)
22 changes: 20 additions & 2 deletions skrub/tests/test_tabular_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import pytest
from sklearn import ensemble
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from skrub import (
Expand All @@ -14,7 +16,13 @@


@pytest.mark.parametrize(
"learner_kind", ["regressor", "regression", "classifier", "classification"]
"learner_kind",

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not a big deal at all but in general please try to avoid changes that are unrelated to your Pull Request to make it easier to review and avoid cluttering the git history

[
"regressor",
"regression",
"classifier",
"classification",
],
)
def test_default_pipeline(learner_kind):
p = tabular_pipeline(learner_kind)
Expand Down Expand Up @@ -74,3 +82,13 @@ def test_from_dtype():
ensemble.HistGradientBoostingRegressor(categorical_features="from_dtype")
)
assert isinstance(p.named_steps["tablevectorizer"].low_cardinality, ToCategorical)


def test_skpipeline_learner():

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def test_skpipeline_learner():
def test_estimator_is_a_pipeline():

original_learner = LogisticRegression()
sk_pipeline = Pipeline([("pca", PCA()), ("clf", original_learner)])
tab_pipeline = tabular_pipeline(sk_pipeline)
assert len([e for _, e in tab_pipeline.steps]) == 5
tv, imputer, scaler, pca, learner = (e for _, e in tab_pipeline.steps)
Comment thread
khaoulariad marked this conversation as resolved.
Outdated
assert learner is original_learner
assert isinstance(pca, PCA)
Loading