Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 66 additions & 2 deletions feature_engine/encoding/one_hot.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

import warnings
from typing import List, Optional, Union

import numpy as np
Expand Down Expand Up @@ -94,6 +95,19 @@ class OneHotEncoder(CategoricalMethodsMixin, CategoricalInitMixin):
to `True`, will ensure that for every binary variable in the dataset, only 1
dummy is created.

drop: str, default=None
Controls which category to drop when creating k-1 dummy variables. Only used
if `top_categories` is None. If `drop` is not None and `drop_last` is also
True, a `FutureWarning` is raised and `drop` takes precedence.

- ``None``: `drop` is ignored and `drop_last` alone decides whether a category is
dropped. With ``drop_last=False``, no category is dropped (k dummies).
- ``'last'``: Drops the last category in alphabetical order.
- ``'first'``: Drops the first category in alphabetical order.
- ``'most_frequent'``: Drops the most frequent category found during ``fit()``.
If there is a tie, a ``UserWarning`` is raised and the first
category alphabetically among the tied categories is dropped.

{variables}

{ignore_format}
Expand Down Expand Up @@ -162,6 +176,7 @@ def __init__(
top_categories: Optional[int] = None,
drop_last: bool = False,
drop_last_binary: bool = False,
drop: Optional[str] = None,
variables: Union[None, int, str, List[Union[str, int]]] = None,
ignore_format: bool = False,
) -> None:
Expand All @@ -185,10 +200,26 @@ def __init__(
f"Got {drop_last_binary} instead."
)

if drop is not None and drop not in ("last", "first", "most_frequent"):
raise ValueError(
"drop takes only values None, 'last', 'first', or "
f"'most_frequent'. Got {drop} instead."
)

if drop is not None and drop_last is True:
warnings.warn(
"Both `drop_last` and `drop` were set. `drop_last` is deprecated "
"when used together with `drop`. `drop` will take precedence. "
"In future versions, `drop_last` will be removed.",
FutureWarning,
stacklevel=2,
)

super().__init__(variables, ignore_format)
self.top_categories = top_categories
self.drop_last = drop_last
self.drop_last_binary = drop_last_binary
self.drop = drop

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Expand Down Expand Up @@ -230,8 +261,41 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
else:
category_ls = list(X[var].unique())

# return k-1 dummies
if self.drop_last:
if self.drop is not None:
sorted_cats = sorted(category_ls)

if self.drop == "last":
self.encoder_dict_[var] = sorted_cats[:-1]

elif self.drop == "first":
self.encoder_dict_[var] = sorted_cats[1:]

elif self.drop == "most_frequent":
freq = X[var].value_counts()
max_freq = freq.iloc[0]
most_frequent_cats = freq[
freq == max_freq
].index.tolist()

if len(most_frequent_cats) > 1:
cat_to_drop = sorted(most_frequent_cats)[0]
warnings.warn(
f"Variable '{var}': multiple categories "
f"share the highest frequency ({max_freq}). "
f"Dropping '{cat_to_drop}' (first "
f"alphabetically among ties).",
UserWarning,
stacklevel=2,
)
else:
cat_to_drop = most_frequent_cats[0]

self.encoder_dict_[var] = [
c for c in category_ls if c != cat_to_drop
]

# Legacy path: drop_last (backward compatible)
elif self.drop_last:
self.encoder_dict_[var] = category_ls[:-1]

# return k dummies
Expand Down
96 changes: 70 additions & 26 deletions feature_engine/selection/probe_feature_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,13 @@ class ProbeFeatureSelection(BaseSelector):

{variables}

variables_discrete: list, default=None
A list of discrete variables. If None, all variables are treated equally and
their feature importance is compared to the feature importance of all probe
features. If passed, the discrete variables will be compared to the discrete
probe features, and the continuous variables will be compared to the
continuous probe features.

collective: bool, default=True
Whether the feature importance should be derived from an estimator trained on
the entire dataset (True), or trained using individual features (False).
Expand Down Expand Up @@ -140,6 +147,9 @@ class ProbeFeatureSelection(BaseSelector):

{variables_}

variables_discrete_:
A list of discrete variables to be compared with discrete probes, or None if
`variables_discrete` was not set.

{feature_names_in_}

{n_features_in_}
Expand Down Expand Up @@ -183,6 +193,7 @@ def __init__(
self,
estimator,
variables: Variables = None,
variables_discrete: Variables = None,
collective: bool = True,
scoring: str = "roc_auc",
n_probes: int = 1,
Expand Down Expand Up @@ -244,6 +255,7 @@ def __init__(
super().__init__(confirm_variables)
self.estimator = estimator
self.variables = variables
self.variables_discrete = variables_discrete
self.collective = collective
self.scoring = scoring
self.distribution = distribution
Expand Down Expand Up @@ -272,6 +284,19 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
X, self.variables, self.confirm_variables
)

if self.variables_discrete is not None:
self.variables_discrete_ = _select_numerical_variables(
X, self.variables_discrete, self.confirm_variables
)
for var in self.variables_discrete_:
if var not in self.variables_:
raise ValueError(
f"Variable {var} is present in variables_discrete "
f"but not in variables."
)
else:
self.variables_discrete_ = None

# save input features
self._get_feature_names_in(X)

Expand Down Expand Up @@ -360,39 +385,58 @@ def _get_features_to_drop(self):
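Identify the variables whose feature importance is lower than the threshold
derived from the feature importance of the probe features. If discrete variables
were specified, discrete and continuous variables are compared against the
thresholds of the discrete and continuous probes, respectively.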
"""
features_to_drop = []

if self.variables_discrete_ is None:
threshold = self._get_threshold(self.probe_features_.columns)
for var in self.variables_:
if self.feature_importances_[var] < threshold:
features_to_drop.append(var)
else:
discrete_probes = [
c
for c in self.probe_features_.columns
if "binary" in c or "discrete_uniform" in c or "poisson" in c
]
continuous_probes = [
c
for c in self.probe_features_.columns
if "gaussian" in c or ("uniform" in c and "discrete_uniform" not in c)
]

threshold_discrete = self._get_threshold(discrete_probes)
threshold_continuous = self._get_threshold(continuous_probes)

for var in self.variables_:
if var in self.variables_discrete_:
if self.feature_importances_[var] < threshold_discrete:
features_to_drop.append(var)
else:
if self.feature_importances_[var] < threshold_continuous:
features_to_drop.append(var)

return features_to_drop

# if more than 1 probe feature, calculate threshold based on
# probe feature importance.
if self.probe_features_.shape[1] > 1:
def _get_threshold(self, probes):
if not len(probes):
raise ValueError(
"The selected distribution does not generate the required probes. "
"For example, if you set variables_discrete, you need to generate "
"both continuous and discrete probes."
)
if len(probes) > 1:
if self.threshold == "mean":
threshold = self.feature_importances_[
self.probe_features_.columns
].values.mean()
threshold = self.feature_importances_[probes].values.mean()
elif self.threshold == "max":
threshold = self.feature_importances_[
self.probe_features_.columns
].values.max()
threshold = self.feature_importances_[probes].values.max()
else:
threshold = (
self.feature_importances_[
self.probe_features_.columns
].values.mean()
+ 3
* self.feature_importances_[
self.probe_features_.columns
].values.std()
self.feature_importances_[probes].values.mean()
+ 3 * self.feature_importances_[probes].values.std()
)

else:
threshold = self.feature_importances_[self.probe_features_.columns].values

features_to_drop = []

for var in self.variables_:
if self.feature_importances_[var] < threshold:
features_to_drop.append(var)

return features_to_drop
threshold = self.feature_importances_[probes].values[0]
return threshold

def _more_tags(self):
tags_dict = _return_tags()
Expand Down
Loading