diff --git a/feature_engine/encoding/one_hot.py b/feature_engine/encoding/one_hot.py
index e94432a3d..9ab36dc93 100644
--- a/feature_engine/encoding/one_hot.py
+++ b/feature_engine/encoding/one_hot.py
@@ -1,6 +1,7 @@
 # Authors: Soledad Galli
 # License: BSD 3 clause
 
+import warnings
 from typing import List, Optional, Union
 
 import numpy as np
@@ -94,6 +95,19 @@ class OneHotEncoder(CategoricalMethodsMixin, CategoricalInitMixin):
         to `True`, will ensure that for every binary variable in the dataset, only 1
         dummy is created.
 
+    drop: str, default=None
+        Controls which category to drop when creating k-1 dummy variables. Only used
+        if `top_categories` is None. If `drop` is not None and `drop_last` is also
+        True, a `FutureWarning` is raised and `drop` takes precedence.
+
+        - ``None``: `drop` is ignored and ``drop_last`` determines whether k or k-1
+          dummies are created.
+        - ``'last'``: Drops the last category in alphabetical order.
+        - ``'first'``: Drops the first category in alphabetical order.
+        - ``'most_frequent'``: Drops the most frequent category found during ``fit()``.
+          If there is a tie, a ``UserWarning`` is raised and the first
+          category alphabetically among the tied categories is dropped.
+
     {variables}
 
     {ignore_format}
@@ -162,6 +176,7 @@ def __init__(
         top_categories: Optional[int] = None,
         drop_last: bool = False,
         drop_last_binary: bool = False,
+        drop: Optional[str] = None,
         variables: Union[None, int, str, List[Union[str, int]]] = None,
         ignore_format: bool = False,
     ) -> None:
@@ -185,10 +200,26 @@ def __init__(
                 f"Got {drop_last_binary} instead."
             )
 
+        if drop is not None and drop not in ("last", "first", "most_frequent"):
+            raise ValueError(
+                "drop takes only values None, 'last', 'first', or "
+                f"'most_frequent'. Got {drop} instead."
+            )
+
+        if drop is not None and drop_last is True:
+            warnings.warn(
+                "Both `drop_last` and `drop` were set. `drop_last` is deprecated "
+                "when used together with `drop`. `drop` will take precedence. "
+                "In future versions, `drop_last` will be removed.",
+                FutureWarning,
+                stacklevel=2,
+            )
+
         super().__init__(variables, ignore_format)
         self.top_categories = top_categories
         self.drop_last = drop_last
         self.drop_last_binary = drop_last_binary
+        self.drop = drop
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
@@ -230,8 +261,41 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
             else:
                 category_ls = list(X[var].unique())
 
-                # return k-1 dummies
-                if self.drop_last:
+                if self.drop is not None:
+                    sorted_cats = sorted(category_ls)
+
+                    if self.drop == "last":
+                        self.encoder_dict_[var] = sorted_cats[:-1]
+
+                    elif self.drop == "first":
+                        self.encoder_dict_[var] = sorted_cats[1:]
+
+                    elif self.drop == "most_frequent":
+                        freq = X[var].value_counts()
+                        max_freq = freq.iloc[0]
+                        most_frequent_cats = freq[
+                            freq == max_freq
+                        ].index.tolist()
+
+                        if len(most_frequent_cats) > 1:
+                            cat_to_drop = min(most_frequent_cats)
+                            warnings.warn(
+                                f"Variable '{var}': multiple categories "
+                                f"share the highest frequency ({max_freq}). "
+                                f"Dropping '{cat_to_drop}' (first "
+                                f"alphabetically among ties).",
+                                UserWarning,
+                                stacklevel=2,
+                            )
+                        else:
+                            cat_to_drop = most_frequent_cats[0]
+
+                        self.encoder_dict_[var] = [
+                            c for c in sorted_cats if c != cat_to_drop
+                        ]
+
+                # Legacy path: drop_last (backward compatible)
+                elif self.drop_last:
                     self.encoder_dict_[var] = category_ls[:-1]
 
                 # return k dummies
diff --git a/feature_engine/selection/probe_feature_selection.py b/feature_engine/selection/probe_feature_selection.py
index ec112b3e4..520d3ecd8 100644
--- a/feature_engine/selection/probe_feature_selection.py
+++ b/feature_engine/selection/probe_feature_selection.py
@@ -87,6 +87,13 @@ class ProbeFeatureSelection(BaseSelector):
 
     {variables}
 
+    variables_discrete: list, default=None
+        A list of discrete variables. If None, all variables are treated equally and
+        their feature importance is compared to the feature importance of all probe
+        features. If passed, the discrete variables will be compared to the discrete
+        probe features, and the continuous variables will be compared to the
+        continuous probe features.
+
     collective: bool, default=True
         Whether the feature importance should be derived from an estimator trained on
         the entire dataset (True), or trained using individual features (False).
@@ -140,6 +147,9 @@ class ProbeFeatureSelection(BaseSelector):
 
     {variables_}
 
+    variables_discrete_:
+        A list of discrete variables to be compared with discrete probes.
+
     {feature_names_in_}
 
     {n_features_in_}
@@ -183,6 +193,7 @@ def __init__(
         self,
         estimator,
         variables: Variables = None,
+        variables_discrete: Variables = None,
         collective: bool = True,
         scoring: str = "roc_auc",
         n_probes: int = 1,
@@ -244,6 +255,7 @@ def __init__(
         super().__init__(confirm_variables)
         self.estimator = estimator
         self.variables = variables
+        self.variables_discrete = variables_discrete
         self.collective = collective
         self.scoring = scoring
         self.distribution = distribution
@@ -272,6 +284,19 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
             X, self.variables, self.confirm_variables
         )
 
+        if self.variables_discrete is not None:
+            self.variables_discrete_ = _select_numerical_variables(
+                X, self.variables_discrete, self.confirm_variables
+            )
+            for var in self.variables_discrete_:
+                if var not in self.variables_:
+                    raise ValueError(
+                        f"Variable {var} is present in variables_discrete "
+                        f"but not in variables."
+                    )
+        else:
+            self.variables_discrete_ = None
+
         # save input features
         self._get_feature_names_in(X)
 
@@ -360,39 +385,67 @@ def _get_features_to_drop(self):
         Identify the variables that have a lower feature importance than the average
         feature importance of all the probe features.
         """
+        features_to_drop = []
+
+        if self.variables_discrete_ is None:
+            threshold = self._get_threshold(self.probe_features_.columns)
+            for var in self.variables_:
+                if self.feature_importances_[var] < threshold:
+                    features_to_drop.append(var)
+        else:
+            discrete_probes = [
+                c
+                for c in self.probe_features_.columns
+                if "binary" in c or "discrete_uniform" in c or "poisson" in c
+            ]
+            continuous_probes = [
+                c
+                for c in self.probe_features_.columns
+                if "gaussian" in c or ("uniform" in c and "discrete_uniform" not in c)
+            ]
+
+            threshold_discrete = self._get_threshold(discrete_probes)
+            threshold_continuous = self._get_threshold(continuous_probes)
+
+            for var in self.variables_:
+                if var in self.variables_discrete_:
+                    if self.feature_importances_[var] < threshold_discrete:
+                        features_to_drop.append(var)
+                else:
+                    if self.feature_importances_[var] < threshold_continuous:
+                        features_to_drop.append(var)
+
+        return features_to_drop
 
-        # if more than 1 probe feature, calculate threshold based on
-        # probe feature importance.
-        if self.probe_features_.shape[1] > 1:
+    def _get_threshold(self, probes):
+        """
+        Determine the importance threshold from the importance of the `probes`.
+
+        With 1 probe, the threshold is the importance of that probe. With more than
+        1 probe, it is the mean, the max, or the mean plus 3 times the standard
+        deviation of the probes' importance, depending on `self.threshold`.
+
+        Raises a ValueError if `probes` is empty.
+        """
+        if not len(probes):
+            raise ValueError(
+                "The selected distribution does not generate the required probes. "
+                "For example, if you set variables_discrete, you need to generate "
+                "both continuous and discrete probes."
+            )
+        if len(probes) > 1:
             if self.threshold == "mean":
-                threshold = self.feature_importances_[
-                    self.probe_features_.columns
-                ].values.mean()
+                threshold = self.feature_importances_[probes].values.mean()
             elif self.threshold == "max":
-                threshold = self.feature_importances_[
-                    self.probe_features_.columns
-                ].values.max()
+                threshold = self.feature_importances_[probes].values.max()
             else:
                 threshold = (
-                    self.feature_importances_[
-                        self.probe_features_.columns
-                    ].values.mean()
-                    + 3
-                    * self.feature_importances_[
-                        self.probe_features_.columns
-                    ].values.std()
+                    self.feature_importances_[probes].values.mean()
+                    + 3 * self.feature_importances_[probes].values.std()
                 )
-
         else:
-            threshold = self.feature_importances_[self.probe_features_.columns].values
-
-        features_to_drop = []
-
-        for var in self.variables_:
-            if self.feature_importances_[var] < threshold:
-                features_to_drop.append(var)
-
-        return features_to_drop
+            threshold = self.feature_importances_[probes].values[0]
+        return threshold
 
     def _more_tags(self):
         tags_dict = _return_tags()
diff --git a/tests/test_encoding/test_onehot_encoder.py b/tests/test_encoding/test_onehot_encoder.py
index aca3448be..ac2c8ad0b 100644
--- a/tests/test_encoding/test_onehot_encoder.py
+++ b/tests/test_encoding/test_onehot_encoder.py
@@ -534,3 +534,206 @@ def test_inverse_transform_raises_not_implemented_error(df_enc_binary):
     enc = OneHotEncoder().fit(df_enc_binary)
     with pytest.raises(NotImplementedError):
         enc.inverse_transform(df_enc_binary)
+
+
+# ===========================================================================
+# Tests for the new `drop` parameter (Issue #913)
+# ===========================================================================
+
+
+@pytest.fixture(scope="module")
+def df_drop():
+    """DataFrame with known categories for testing drop strategies."""
+    df = pd.DataFrame(
+        {
+            "x1": ["c", "a", "b", "a", "c", "b", "a"],
+            "x2": ["z", "y", "z", "x", "y", "z", "x"],
+            "num": [1, 2, 3, 4, 5, 6, 7],
+        }
+    )
+    return df
+
+
+def test_drop_last_alphabetically(df_drop):
+    """drop='last' should drop the last category in sorted order."""
+    encoder = OneHotEncoder(drop="last")
+    encoder.fit(df_drop)
+
+    # x1 categories sorted: ['a', 'b', 'c'] -> drop 'c'
+    assert encoder.encoder_dict_["x1"] == ["a", "b"]
+    # x2 categories sorted: ['x', 'y', 'z'] -> drop 'z'
+    assert encoder.encoder_dict_["x2"] == ["x", "y"]
+
+    X = encoder.transform(df_drop)
+    assert "x1_c" not in X.columns
+    assert "x2_z" not in X.columns
+    assert "x1_a" in X.columns
+    assert "x1_b" in X.columns
+    assert "x2_x" in X.columns
+    assert "x2_y" in X.columns
+
+
+def test_drop_first_alphabetically(df_drop):
+    """drop='first' should drop the first category in sorted order."""
+    encoder = OneHotEncoder(drop="first")
+    encoder.fit(df_drop)
+
+    # x1 categories sorted: ['a', 'b', 'c'] -> drop 'a'
+    assert encoder.encoder_dict_["x1"] == ["b", "c"]
+    # x2 categories sorted: ['x', 'y', 'z'] -> drop 'x'
+    assert encoder.encoder_dict_["x2"] == ["y", "z"]
+
+    X = encoder.transform(df_drop)
+    assert "x1_a" not in X.columns
+    assert "x2_x" not in X.columns
+    assert "x1_b" in X.columns
+    assert "x1_c" in X.columns
+    assert "x2_y" in X.columns
+    assert "x2_z" in X.columns
+
+
+def test_drop_most_frequent():
+    """drop='most_frequent' should drop the most common category."""
+    df = pd.DataFrame(
+        {
+            "x1": ["a"] * 10 + ["b"] * 5 + ["c"] * 3,
+        }
+    )
+
+    encoder = OneHotEncoder(drop="most_frequent")
+    encoder.fit(df)
+
+    # 'a' is most frequent (10 times) -> drop 'a'
+    assert "a" not in encoder.encoder_dict_["x1"]
+    assert "b" in encoder.encoder_dict_["x1"]
+    assert "c" in encoder.encoder_dict_["x1"]
+
+    X = encoder.transform(df)
+    assert "x1_a" not in X.columns
+    assert "x1_b" in X.columns
+    assert "x1_c" in X.columns
+
+
+def test_drop_most_frequent_with_tie():
+    """When multiple categories tie for most frequent, warn and drop first alpha."""
+    df = pd.DataFrame(
+        {
+            "x1": ["c"] * 5 + ["a"] * 5 + ["b"] * 3,
+        }
+    )
+
+    with pytest.warns(UserWarning, match="multiple categories share the highest"):
+        encoder = OneHotEncoder(drop="most_frequent")
+        encoder.fit(df)
+
+    # 'a' and 'c' both have frequency 5 — drop 'a' (first alphabetically)
+    assert "a" not in encoder.encoder_dict_["x1"]
+    assert "b" in encoder.encoder_dict_["x1"]
+    assert "c" in encoder.encoder_dict_["x1"]
+
+
+def test_drop_ignored_when_top_categories_set():
+    """top_categories should take precedence over drop."""
+    df = pd.DataFrame(
+        {
+            "x1": ["a"] * 10 + ["b"] * 5 + ["c"] * 3 + ["d"] * 1,
+        }
+    )
+
+    encoder = OneHotEncoder(top_categories=2, drop="first")
+    encoder.fit(df)
+
+    # top_categories=2 should pick the 2 most frequent: ['a', 'b']
+    assert encoder.encoder_dict_["x1"] == ["a", "b"]
+
+
+def test_drop_overrides_drop_last():
+    """When both drop and drop_last are set, drop wins and FutureWarning is raised."""
+    df = pd.DataFrame(
+        {
+            "x1": ["c", "a", "b", "a", "c", "b", "a"],
+        }
+    )
+
+    with pytest.warns(FutureWarning, match="drop_last.*deprecated"):
+        encoder = OneHotEncoder(drop_last=True, drop="first")
+
+    encoder.fit(df)
+
+    # drop="first" should drop 'a' (sorted: ['a', 'b', 'c'])
+    assert encoder.encoder_dict_["x1"] == ["b", "c"]
+
+
+def test_drop_with_drop_last_binary():
+    """drop and drop_last_binary should work together correctly."""
+    df = pd.DataFrame(
+        {
+            "x1": ["a"] * 10 + ["b"] * 5 + ["c"] * 3,
+            "x2": ["yes"] * 10 + ["no"] * 8,  # binary variable
+        }
+    )
+
+    encoder = OneHotEncoder(drop="first", drop_last_binary=True)
+    encoder.fit(df)
+
+    # x1: sorted ['a', 'b', 'c'] -> drop 'a'
+    assert encoder.encoder_dict_["x1"] == ["b", "c"]
+
+    # x2: binary -> drop_last_binary overrides to keep only the first unique
+    assert len(encoder.encoder_dict_["x2"]) == 1
+
+
+@pytest.mark.parametrize(
+    "drop_value", ["empanada", "middle", 123, True, ["last"]]
+)
+def test_error_if_drop_not_valid_string(drop_value):
+    """Invalid drop values should raise ValueError."""
+    with pytest.raises(ValueError, match="drop takes only values"):
+        OneHotEncoder(drop=drop_value)
+
+
+def test_get_feature_names_out_with_drop(df_enc_binary):
+    """get_feature_names_out should reflect the dropped category."""
+    original_features = ["var_num"]
+    input_features = df_enc_binary.columns
+
+    # drop="first": sorted cats for var_A are ['A','B','C'] -> drop 'A'
+    tr = OneHotEncoder(drop="first")
+    tr.fit(df_enc_binary)
+
+    out = [
+        "var_A_B",
+        "var_A_C",
+        "var_B_B",
+        "var_B_C",
+        "var_C_UHU",
+        "var_D_OHO",
+    ]
+    feat_out = original_features + out
+    assert tr.get_feature_names_out(input_features=None) == feat_out
+    assert tr.get_feature_names_out(input_features=input_features) == feat_out
+
+
+def test_drop_none_produces_k_dummies(df_drop):
+    """drop=None (default) should produce k dummies, same as drop_last=False."""
+    encoder = OneHotEncoder(drop=None, drop_last=False)
+    encoder.fit(df_drop)
+
+    # x1 has 3 unique categories -> 3 dummies
+    assert len(encoder.encoder_dict_["x1"]) == 3
+    # x2 has 3 unique categories -> 3 dummies
+    assert len(encoder.encoder_dict_["x2"]) == 3
+
+
+def test_drop_last_backward_compatible(df_drop):
+    """Existing drop_last=True without drop should behave exactly as before."""
+    encoder = OneHotEncoder(drop_last=True)
+    encoder.fit(df_drop)
+
+    # Original behavior: category_ls = list(unique()), drop last element
+    # This preserves insertion order, NOT sorted order
+    x1_unique = list(df_drop["x1"].unique())
+    assert encoder.encoder_dict_["x1"] == x1_unique[:-1]
+
+    x2_unique = list(df_drop["x2"].unique())
+    assert encoder.encoder_dict_["x2"] == x2_unique[:-1]
diff --git a/tests/test_selection/test_probe_feature_selection.py b/tests/test_selection/test_probe_feature_selection.py
index 58d489122..ab2b9b32e 100644
--- a/tests/test_selection/test_probe_feature_selection.py
+++ b/tests/test_selection/test_probe_feature_selection.py
@@ -8,22 +8,30 @@ from feature_engine.selection import ProbeFeatureSelection
 
 
 _input_params = [
-    (RandomForestClassifier(), "precision", "all", 3, 3, 6, 4),
-    (Lasso(), "neg_mean_squared_error", "binary", 7, 7, 4, 100),
-    (LogisticRegression(), "roc_auc", "normal", 5, 5, 2, 73),
-    (DecisionTreeRegressor(), "r2", "uniform", 4, 4, 10, 84),
-    (DecisionTreeRegressor(), "r2", "discrete_uniform", 4, 4, 10, 84),
-    (DecisionTreeRegressor(), "r2", "poisson", 4, 4, 10, 84),
-    (RandomForestClassifier(), "precision", ["binary", "uniform"], 3, 3, 6, 4),
+    (RandomForestClassifier(), "precision", "all", 3, 3, 6, 4, None),
+    (Lasso(), "neg_mean_squared_error", "binary", 7, 7, 4, 100, ["var1"]),
+    (LogisticRegression(), "roc_auc", "normal", 5, 5, 2, 73, None),
+    (DecisionTreeRegressor(), "r2", "uniform", 4, 4, 10, 84, ["var2"]),
+    (DecisionTreeRegressor(), "r2", "discrete_uniform", 4, 4, 10, 84, None),
+    (DecisionTreeRegressor(), "r2", "poisson", 4, 4, 10, 84, ["var1", "var2"]),
+    (RandomForestClassifier(), "precision", ["binary", "uniform"], 3, 3, 6, 4, None),
 ]
 
 
 @pytest.mark.parametrize(
-    "_estimator, _scoring, _distribution, _n_cat, _cv, _n_probes, _random_state",
+    "_estimator, _scoring, _distribution, _n_cat, _cv, _n_probes, "
+    "_random_state, _variables_discrete",
     _input_params,
 )
 def test_input_params_assignment(
-    _estimator, _scoring, _distribution, _n_cat, _cv, _n_probes, _random_state
+    _estimator,
+    _scoring,
+    _distribution,
+    _n_cat,
+    _cv,
+    _n_probes,
+    _random_state,
+    _variables_discrete,
 ):
     sel = ProbeFeatureSelection(
         estimator=_estimator,
@@ -33,6 +41,7 @@ def test_input_params_assignment(
         cv=_cv,
         n_probes=_n_probes,
         random_state=_random_state,
+        variables_discrete=_variables_discrete,
     )
 
     assert sel.estimator == _estimator
@@ -42,6 +51,7 @@ def test_input_params_assignment(
     assert sel.cv == _cv
     assert sel.n_probes == _n_probes
     assert sel.random_state == _random_state
+    assert sel.variables_discrete == _variables_discrete
 
 
 @pytest.mark.parametrize("collective", [True, False])
@@ -349,6 +359,7 @@ def test_get_features_to_drop_with_one_probe(thresh):
     )
     sel.probe_features_ = pd.DataFrame({"probe": [1, 1, 1, 1, 1]})
     sel.variables_ = ["var1", "var2", "var3"]
+    sel.variables_discrete_ = None
     assert sel._get_features_to_drop() == ["var3"]
 
 
@@ -374,9 +385,81 @@ def test_get_features_to_drop_with_many_probes(thresh, vars_to_drop):
         {"probe1": [1, 1, 1, 1, 1], "probe2": [1, 1, 1, 1, 1]}
     )
     sel.variables_ = ["var1", "var2", "var3", "var4"]
+    sel.variables_discrete_ = None
     assert sel._get_features_to_drop() == vars_to_drop
 
 
+def test_variables_discrete_raises_error_when_not_in_variables(df_test):
+    X, y = df_test
+
+    sel = ProbeFeatureSelection(
+        estimator=DecisionTreeClassifier(),
+        variables=["var_0", "var_1"],
+        variables_discrete=["var_2"],
+    )
+    msg = "Variable var_2 is present in variables_discrete but not in variables."
+    with pytest.raises(ValueError, match=msg):
+        sel.fit(X, y)
+
+
+def test_variables_discrete_raises_error_when_no_probes_generated(df_test):
+    X, y = df_test
+
+    sel = ProbeFeatureSelection(
+        estimator=DecisionTreeClassifier(),
+        variables=["var_0", "var_1"],
+        variables_discrete=["var_1"],
+        distribution="normal",  # only generates continuous probes
+    )
+    msg = "The selected distribution does not generate the required probes.*"
+    with pytest.raises(ValueError, match=msg):
+        sel.fit(X, y)
+
+    sel = ProbeFeatureSelection(
+        estimator=DecisionTreeClassifier(),
+        variables=["var_0", "var_1"],
+        variables_discrete=["var_1"],
+        distribution="binary",  # only generates discrete probes
+    )
+    with pytest.raises(ValueError, match=msg):
+        sel.fit(X, y)
+
+
+def test_variables_discrete_functionality():
+    sel = ProbeFeatureSelection(
+        estimator=LogisticRegression(),
+        n_probes=2,
+    )
+    sel.threshold = "mean"
+    # Thresholds: mean(gaussian probes) = 9 and mean(binary probes) = 8. var3 (8.5)
+    # is kept only if compared to the discrete probes; var4 (8.7) is dropped.
+    sel.feature_importances_ = pd.Series(
+        [11, 20, 8.5, 8.7, 10, 8, 9, 7],
+        index=[
+            "var1",
+            "var2",
+            "var3",
+            "var4",
+            "gaussian_probe_0",
+            "gaussian_probe_1",
+            "binary_probe_0",
+            "binary_probe_1",
+        ],
+    )
+    sel.probe_features_ = pd.DataFrame(
+        {
+            "gaussian_probe_0": [1],
+            "gaussian_probe_1": [1],
+            "binary_probe_0": [1],
+            "binary_probe_1": [1],
+        }
+    )
+    sel.variables_ = ["var1", "var2", "var3", "var4"]
+    sel.variables_discrete_ = ["var3"]
+
+    assert sel._get_features_to_drop() == ["var4"]
+
+
 def test_cv_generator(df_test):
     X, y = df_test
     cv = StratifiedKFold(n_splits=3)