-
Notifications
You must be signed in to change notification settings - Fork 260
feat: take pipeline's estimator into account to decide tabular vectorizer options in TabularPipeline #2152
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
feat: take pipeline's estimator into account to decide tabular vectorizer options in TabularPipeline #2152
Changes from 6 commits
de8bd3e
2961a8a
e5406db
37fc386
26ad5e6
30a9019
0a78b9f
3f24e2b
5dc4fa5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,7 @@ | ||
| from sklearn import ensemble | ||
| from sklearn.base import BaseEstimator | ||
| from sklearn.impute import SimpleImputer | ||
| from sklearn.pipeline import make_pipeline | ||
| from sklearn.pipeline import Pipeline, make_pipeline | ||
| from sklearn.preprocessing import OrdinalEncoder | ||
|
|
||
| from ._datetime_encoder import DatetimeEncoder | ||
|
|
@@ -48,6 +48,7 @@ def tabular_pipeline(estimator, *, n_jobs=None): | |
| Parameters | ||
| ---------- | ||
| estimator : {"regressor", "regression", "classifier", "classification"} or sklearn.base.BaseEstimator | ||
| or sklearn.pipeline.Pipeline | ||
| The estimator to use as the final step in the pipeline. Based on the type of | ||
| estimator, the previous preprocessing steps and their respective parameters are | ||
| chosen. The possible values are: | ||
|
|
@@ -59,6 +60,8 @@ def tabular_pipeline(estimator, *, n_jobs=None): | |
| :obj:`~sklearn.ensemble.HistGradientBoostingClassifier` is used as the final | ||
| step; | ||
| - a scikit-learn estimator: the provided estimator is used as the final step. | ||
| - a scikit-learn pipeline : the whole pipeline is kept and usual parameters are added depending | ||
| on the estimator in the last step of the pipeline. | ||
|
khaoulariad marked this conversation as resolved.
Outdated
|
||
|
|
||
| n_jobs : int, default=None | ||
| Number of jobs to run in parallel in the :obj:`TableVectorizer` step. ``None`` | ||
|
|
@@ -224,14 +227,18 @@ def tabular_pipeline(estimator, *, n_jobs=None): | |
| """ # noqa: E501 | ||
| vectorizer = TableVectorizer(n_jobs=n_jobs) | ||
| cat_feat_kwargs = {"categorical_features": "from_dtype"} | ||
| if isinstance(estimator, Pipeline): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this should come after the checks below -- it is unlikely someone will put the string "regressor" or a class at the end of a pipeline before passing it to tabular_pipeline also please avoid local variable names that differ by a single character like estimator and estimator_. here we can have something like if isinstance(estimator, Pipeline):
*user_transformers, estimator = estimator.steps
else:
user_transformers = ()
...
make_pipeline(TableVectorizer(), *user_transformers, estimator) |
||
| estimator_ = estimator[-1] | ||
| else: | ||
| estimator_ = estimator | ||
|
|
||
| if isinstance(estimator, str): | ||
| if estimator in ("classifier", "classification"): | ||
| if isinstance(estimator_, str): | ||
| if estimator_ in ("classifier", "classification"): | ||
| return tabular_pipeline( | ||
| ensemble.HistGradientBoostingClassifier(**cat_feat_kwargs), | ||
| n_jobs=n_jobs, | ||
| ) | ||
| if estimator in ("regressor", "regression"): | ||
| if estimator_ in ("regressor", "regression"): | ||
| return tabular_pipeline( | ||
| ensemble.HistGradientBoostingRegressor(**cat_feat_kwargs), | ||
| n_jobs=n_jobs, | ||
|
|
@@ -240,20 +247,20 @@ def tabular_pipeline(estimator, *, n_jobs=None): | |
| "If ``estimator`` is a string it should be 'regressor', 'regression'," | ||
| " 'classifier' or 'classification'." | ||
| ) | ||
| if isinstance(estimator, type) and issubclass(estimator, BaseEstimator): | ||
| if isinstance(estimator_, type) and issubclass(estimator, BaseEstimator): | ||
| raise TypeError( | ||
| "tabular_pipeline expects a scikit-learn estimator as its first" | ||
| f" argument. Pass an instance of {estimator.__name__} rather than the class" | ||
| " itself." | ||
| f" argument. Pass an instance of {estimator_.__name__} rather than" | ||
| " the class itself." | ||
| ) | ||
| if not isinstance(estimator, BaseEstimator): | ||
| if not isinstance(estimator_, BaseEstimator): | ||
| raise TypeError( | ||
| "tabular_pipeline expects a scikit-learn estimator, 'regressor'," | ||
| " or 'classifier' as its first argument." | ||
| ) | ||
|
|
||
| if ( | ||
| isinstance(estimator, _HGBT_CLASSES) | ||
| isinstance(estimator_, _HGBT_CLASSES) | ||
| and getattr(estimator, "categorical_features", None) == "from_dtype" | ||
| ): | ||
| vectorizer.set_params( | ||
|
|
@@ -270,10 +277,15 @@ def tabular_pipeline(estimator, *, n_jobs=None): | |
| ) | ||
| else: | ||
| vectorizer.set_params(datetime=DatetimeEncoder(periodic_encoding="spline")) | ||
|
|
||
| steps = [vectorizer] | ||
| if not get_tags(estimator).input_tags.allow_nan: | ||
| if not get_tags(estimator_).input_tags.allow_nan: | ||
| steps.append(SimpleImputer(add_indicator=True)) | ||
| if not isinstance(estimator, _TREE_ENSEMBLE_CLASSES): | ||
| if not isinstance(estimator_, _TREE_ENSEMBLE_CLASSES): | ||
| steps.append(SquashingScaler(max_absolute_value=5)) | ||
| steps.append(estimator) | ||
| if isinstance(estimator, Pipeline): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. with the suggestion above we can do all the handling of pipelines in one place |
||
| steps_pipeline = [sp for _, sp in estimator.steps] | ||
| steps.extend(steps_pipeline) | ||
| else: | ||
| steps.append(estimator_) | ||
| return make_pipeline(*steps) | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,7 +1,9 @@ | ||||||
| import pytest | ||||||
| from sklearn import ensemble | ||||||
| from sklearn.decomposition import PCA | ||||||
| from sklearn.impute import SimpleImputer | ||||||
| from sklearn.linear_model import Ridge | ||||||
| from sklearn.linear_model import LogisticRegression, Ridge | ||||||
| from sklearn.pipeline import Pipeline | ||||||
| from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder | ||||||
|
|
||||||
| from skrub import ( | ||||||
|
|
@@ -14,7 +16,13 @@ | |||||
|
|
||||||
|
|
||||||
| @pytest.mark.parametrize( | ||||||
| "learner_kind", ["regressor", "regression", "classifier", "classification"] | ||||||
| "learner_kind", | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not a big deal at all but in general please try to avoid changes that are unrelated to your Pull Request to make it easier to review and avoid cluttering the git history |
||||||
| [ | ||||||
| "regressor", | ||||||
| "regression", | ||||||
| "classifier", | ||||||
| "classification", | ||||||
| ], | ||||||
| ) | ||||||
| def test_default_pipeline(learner_kind): | ||||||
| p = tabular_pipeline(learner_kind) | ||||||
|
|
@@ -74,3 +82,13 @@ def test_from_dtype(): | |||||
| ensemble.HistGradientBoostingRegressor(categorical_features="from_dtype") | ||||||
| ) | ||||||
| assert isinstance(p.named_steps["tablevectorizer"].low_cardinality, ToCategorical) | ||||||
|
|
||||||
|
|
||||||
| def test_skpipeline_learner(): | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| original_learner = LogisticRegression() | ||||||
| sk_pipeline = Pipeline([("pca", PCA()), ("clf", original_learner)]) | ||||||
| tab_pipeline = tabular_pipeline(sk_pipeline) | ||||||
| assert len([e for _, e in tab_pipeline.steps]) == 5 | ||||||
| tv, imputer, scaler, pca, learner = (e for _, e in tab_pipeline.steps) | ||||||
|
khaoulariad marked this conversation as resolved.
Outdated
|
||||||
| assert learner is original_learner | ||||||
| assert isinstance(pca, PCA) | ||||||
Uh oh!
There was an error while loading. Please reload this page.