feature-engine · BALOGUN-DAVID · May 31, 2026 · May 31, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/feature_engine/encoding/one_hot.py b/feature_engine/encoding/one_hot.py
@@ -1,6 +1,7 @@
 # Authors: Soledad Galli <solegalli@protonmail.com>
 # License: BSD 3 clause
 
+import warnings
 from typing import List, Optional, Union
 
 import numpy as np
@@ -94,6 +95,19 @@ class OneHotEncoder(CategoricalMethodsMixin, CategoricalInitMixin):
         to `True`, will ensure that for every binary variable in the dataset, only 1
         dummy is created.
 
+    drop: str, default=None
+        Controls which category to drop when creating k-1 dummy variables. Only used
+        if `top_categories` is None. If `drop` is not None and `drop_last` is also
+        True, a `FutureWarning` is issued and `drop` takes precedence.
+
+        - ``None``: `drop` is ignored and ``drop_last`` decides whether k or k-1
+          dummies are created.
+        - ``'last'``: Drops the last category after sorting the categories.
+        - ``'first'``: Drops the first category after sorting the categories.
+        - ``'most_frequent'``: Drops the most frequent category found during ``fit()``.
+          If there is a tie, a ``UserWarning`` is issued and the first
+          category alphabetically among the tied categories is dropped.
+
     {variables}
 
     {ignore_format}
@@ -162,6 +176,7 @@ def __init__(
         top_categories: Optional[int] = None,
         drop_last: bool = False,
         drop_last_binary: bool = False,
+        drop: Optional[str] = None,
         variables: Union[None, int, str, List[Union[str, int]]] = None,
         ignore_format: bool = False,
     ) -> None:
@@ -185,10 +200,26 @@ def __init__(
                 f"Got {drop_last_binary} instead."
             )
 
+        if drop is not None and drop not in ("last", "first", "most_frequent"):
+            raise ValueError(
+                "drop takes only values None, 'last', 'first', or "
+                f"'most_frequent'. Got {drop} instead."
+            )
+
+        if drop is not None and drop_last is True:
+            warnings.warn(
+                "Both `drop_last` and `drop` were set. `drop_last` is deprecated "
+                "when used together with `drop`. `drop` will take precedence. "
+                "In future versions, `drop_last` will be removed.",
+                FutureWarning,
+                stacklevel=2,
+            )
+
         super().__init__(variables, ignore_format)
         self.top_categories = top_categories
         self.drop_last = drop_last
         self.drop_last_binary = drop_last_binary
+        self.drop = drop
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
@@ -230,8 +261,41 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
             else:
                 category_ls = list(X[var].unique())
 
-                # return k-1 dummies
-                if self.drop_last:
+                if self.drop is not None:
+                    sorted_cats = sorted(category_ls)
+
+                    if self.drop == "last":
+                        self.encoder_dict_[var] = sorted_cats[:-1]
+
+                    elif self.drop == "first":
+                        self.encoder_dict_[var] = sorted_cats[1:]
+
+                    elif self.drop == "most_frequent":
+                        freq = X[var].value_counts()
+                        max_freq = freq.iloc[0]
+                        most_frequent_cats = freq[
+                            freq == max_freq
+                        ].index.tolist()
+
+                        if len(most_frequent_cats) > 1:
+                            cat_to_drop = sorted(most_frequent_cats)[0]
+                            warnings.warn(
+                                f"Variable '{var}': multiple categories "
+                                f"share the highest frequency ({max_freq}). "
+                                f"Dropping '{cat_to_drop}' (first "
+                                f"alphabetically among ties).",
+                                UserWarning,
+                                stacklevel=2,
+                            )
+                        else:
+                            cat_to_drop = most_frequent_cats[0]
+
+                        self.encoder_dict_[var] = [
+                            c for c in category_ls if c != cat_to_drop
+                        ]
+
+                # Legacy path: drop_last (backward compatible)
+                elif self.drop_last:
                     self.encoder_dict_[var] = category_ls[:-1]
 
                 # return k dummies

diff --git a/feature_engine/selection/probe_feature_selection.py b/feature_engine/selection/probe_feature_selection.py
@@ -87,6 +87,13 @@ class ProbeFeatureSelection(BaseSelector):
 
     {variables}
 
+    variables_discrete: list, default=None
+        Discrete variables, which must also be part of `variables`. If None, every
+        variable is compared against the importance of all probe features. If
+        passed, discrete variables are compared against the discrete probes and all
+        other variables against the continuous probes, so `distribution` must
+        generate both discrete and continuous probes.
+
     collective: bool, default=True
          Whether the feature importance should be derived from an estimator trained on
          the entire dataset (True), or trained using individual features (False).
@@ -140,6 +147,9 @@ class ProbeFeatureSelection(BaseSelector):
 
     {variables_}
 
+    variables_discrete_:
+        The discrete variables compared with the discrete probes. None if not set.
+
     {feature_names_in_}
 
     {n_features_in_}
@@ -183,6 +193,7 @@ def __init__(
         self,
         estimator,
         variables: Variables = None,
+        variables_discrete: Variables = None,
         collective: bool = True,
         scoring: str = "roc_auc",
         n_probes: int = 1,
@@ -244,6 +255,7 @@ def __init__(
         super().__init__(confirm_variables)
         self.estimator = estimator
         self.variables = variables
+        self.variables_discrete = variables_discrete
         self.collective = collective
         self.scoring = scoring
         self.distribution = distribution
@@ -272,6 +284,19 @@ def fit(self, X: pd.DataFrame, y: pd.Series):
             X, self.variables, self.confirm_variables
         )
 
+        if self.variables_discrete is not None:
+            self.variables_discrete_ = _select_numerical_variables(
+                X, self.variables_discrete, self.confirm_variables
+            )
+            for var in self.variables_discrete_:
+                if var not in self.variables_:
+                    raise ValueError(
+                        f"Variable {var} is present in variables_discrete "
+                        f"but not in variables."
+                    )
+        else:
+            self.variables_discrete_ = None
+
         # save input features
         self._get_feature_names_in(X)
 
@@ -360,39 +385,58 @@ def _get_features_to_drop(self):
         Identify the variables that have a lower feature importance than the average
         feature importance of all the probe features.
         """
+        features_to_drop = []
+
+        if self.variables_discrete_ is None:
+            threshold = self._get_threshold(self.probe_features_.columns)
+            for var in self.variables_:
+                if self.feature_importances_[var] < threshold:
+                    features_to_drop.append(var)
+        else:
+            discrete_probes = [
+                c
+                for c in self.probe_features_.columns
+                if "binary" in c or "discrete_uniform" in c or "poisson" in c
+            ]
+            continuous_probes = [
+                c
+                for c in self.probe_features_.columns
+                if "gaussian" in c or ("uniform" in c and "discrete_uniform" not in c)
+            ]
+
+            threshold_discrete = self._get_threshold(discrete_probes)
+            threshold_continuous = self._get_threshold(continuous_probes)
+
+            for var in self.variables_:
+                if var in self.variables_discrete_:
+                    if self.feature_importances_[var] < threshold_discrete:
+                        features_to_drop.append(var)
+                else:
+                    if self.feature_importances_[var] < threshold_continuous:
+                        features_to_drop.append(var)
+
+        return features_to_drop
 
-        # if more than 1 probe feature, calculate threshold based on
-        # probe feature importance.
-        if self.probe_features_.shape[1] > 1:
+    def _get_threshold(self, probes):
+        if not len(probes):
+            raise ValueError(
+                "The selected distribution does not generate the required probes. "
+                "For example, if you set variables_discrete, you need to generate "
+                "both continuous and discrete probes."
+            )
+        if len(probes) > 1:
             if self.threshold == "mean":
-                threshold = self.feature_importances_[
-                    self.probe_features_.columns
-                ].values.mean()
+                threshold = self.feature_importances_[probes].values.mean()
             elif self.threshold == "max":
-                threshold = self.feature_importances_[
-                    self.probe_features_.columns
-                ].values.max()
+                threshold = self.feature_importances_[probes].values.max()
             else:
                 threshold = (
-                    self.feature_importances_[
-                        self.probe_features_.columns
-                    ].values.mean()
-                    + 3
-                    * self.feature_importances_[
-                        self.probe_features_.columns
-                    ].values.std()
+                    self.feature_importances_[probes].values.mean()
+                    + 3 * self.feature_importances_[probes].values.std()
                 )
-
         else:
-            threshold = self.feature_importances_[self.probe_features_.columns].values
-
-        features_to_drop = []
-
-        for var in self.variables_:
-            if self.feature_importances_[var] < threshold:
-                features_to_drop.append(var)
-
-        return features_to_drop
+            threshold = self.feature_importances_[probes].values[0]
+        return threshold
 
     def _more_tags(self):
         tags_dict = _return_tags()