feature-engine · direkkakkar319-ops · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026
diff --git a/.gitignore b/.gitignore
@@ -86,6 +86,7 @@ celerybeat-schedule
 # Environments
 .env
 .venv
+.venv_wsl
 env/
 venv/
 ENV/

diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py
@@ -1,6 +1,7 @@
 # Authors: Soledad Galli <solegalli@protonmail.com>
 # License: BSD 3 clause
 
+import warnings
 from typing import List, Optional, Union
 
 import pandas as pd
@@ -12,12 +13,10 @@
     _feature_names_in_docstring,
     _imputer_dict_docstring,
     _n_features_in_docstring,
-    _variables_attribute_docstring,
-)
-from feature_engine._docstrings.methods import (
-    _fit_transform_docstring,
-    _transform_imputers_docstring,
+    _variables_attribute_docstring
 )
+from feature_engine._docstrings.methods import (_fit_transform_docstring,
+                                                _transform_imputers_docstring)
 from feature_engine._docstrings.substitute import Substitution
 from feature_engine.dataframe_checks import check_X
 from feature_engine.imputation.base_imputer import BaseImputer
@@ -26,7 +25,7 @@
     check_all_variables,
     check_categorical_variables,
     find_all_variables,
-    find_categorical_variables,
+    find_categorical_variables
 )
 
 
@@ -88,6 +87,18 @@ class CategoricalImputer(BaseImputer):
         type object or categorical. If True, the imputer will select all variables or
         accept all variables entered by the user, including those cast as numeric.
 
+    multimodal : str, default='raise'
+        Indicates what to do when imputation_method='frequent'
+        and a variable has more than 1 mode.
+
+        If 'raise', raises a ValueError and stops the fit.
+
+        If 'warn', raises a UserWarning and continues, imputing using the
+        first most frequent category found.
+
+        If 'ignore', continues without warnings, imputing using the first
+        most frequent category found.
+
     Attributes
     ----------
     {imputer_dict_}
@@ -135,6 +146,7 @@ def __init__(
         variables: Union[None, int, str, List[Union[str, int]]] = None,
         return_object: bool = False,
         ignore_format: bool = False,
+        errors: str = "raise",
-        errors: str = "raise",
+        multimodal: str = "raise",
-        errors: str = "raise",
+        multimodal: str = "raise",
     ) -> None:
         if imputation_method not in ["missing", "frequent"]:
             raise ValueError(
@@ -144,11 +156,18 @@ def __init__(
         if not isinstance(ignore_format, bool):
             raise ValueError("ignore_format takes only booleans True and False")
 
+        if errors not in ["raise", "warn", "ignore"]:
+            raise ValueError(
+                "errors takes only values 'raise', 'warn', or 'ignore'. "
+                f"Got {errors} instead."
+            )
+
         self.imputation_method = imputation_method
         self.fill_value = fill_value
         self.variables = _check_variables_input_value(variables)
         self.return_object = return_object
         self.ignore_format = ignore_format
+        self.errors = errors
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
@@ -189,9 +208,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
 
                 # Some variables may contain more than 1 mode:
                 if len(mode_vals) > 1:
-                    raise ValueError(
-                        f"The variable {var} contains multiple frequent categories."
-                    )
+                    if self.errors == "raise":
+                        raise ValueError(
+                            f"The variable {var} contains multiple "
+                            f"frequent categories. Set errors='warn' or "
+                            f"errors='ignore' to allow imputation using "
+                            f"the first most frequent category found."
+                        )
+                    elif self.errors == "warn":
+                        warnings.warn(
+                            f"Variable {var} has multiple frequent "
+                            f"categories. The first category found, "
+                            f"{mode_vals[0]}, will be used for imputation.",
+                            UserWarning,
+                        )
 
                 self.imputer_dict_ = {var: mode_vals[0]}
 
@@ -208,10 +238,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
                         varnames_str = ", ".join(varnames)
                     else:
                         varnames_str = varnames[0]
-                    raise ValueError(
-                        f"The variable(s) {varnames_str} contain(s) multiple frequent "
-                        f"categories."
-                    )
+
+                    if self.errors == "raise":
+                        raise ValueError(
+                            f"The variable(s) {varnames_str} contain(s) "
+                            f"multiple frequent categories. Set "
+                            f"errors='warn' or errors='ignore' to allow "
+                            f"imputation using the first most frequent "
+                            f"category found."
+                        )
+                    elif self.errors == "warn":
+                        warnings.warn(
+                            f"Variable(s) {varnames_str} have multiple "
+                            f"frequent categories. The first category "
+                            f"found will be used for imputation.",
+                            UserWarning,
+                        )
 
                 self.imputer_dict_ = mode_vals.iloc[0].to_dict()
 

diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py
@@ -1,9 +1,28 @@
+import re
+import warnings
+
+import numpy as np
 import pandas as pd
 import pytest
 
 from feature_engine.imputation import CategoricalImputer
 
 
+@pytest.fixture
+def multimodal_df():
+    return pd.DataFrame(
+        {
+            "city": [
+                "London", "London", "Paris", "Paris", "Berlin", "Berlin", "Madrid"
+            ],
+            "country": ["UK", "UK", "FR", "FR", "DE", "DE", "ES"],
+            "one_mode": [
+                "London", "London", "London", "Paris", "Paris", "Berlin", "Berlin"
+            ],
+        }
+    )
+
+
 def test_impute_with_string_missing_and_automatically_find_variables(df_na):
     # set up transformer
     imputer = CategoricalImputer(imputation_method="missing", variables=None)
@@ -145,33 +164,40 @@ def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_object(df_n
 
 
 def test_error_when_imputation_method_not_frequent_or_missing():
-    with pytest.raises(ValueError):
+    msg = "imputation_method takes only values 'missing' or 'frequent'"
+    with pytest.raises(ValueError, match=msg):
         CategoricalImputer(imputation_method="arbitrary")
 
 
 def test_error_when_variable_contains_multiple_modes(df_na):
-    msg = "The variable Name contains multiple frequent categories."
+    msg = (
+        "The variable Name contains multiple frequent categories. "
+        "Set errors='warn' or errors='ignore' to allow imputation "
+        "using the first most frequent category found."
+    )
     imputer = CategoricalImputer(imputation_method="frequent", variables="Name")
-    with pytest.raises(ValueError) as record:
+    with pytest.raises(ValueError, match=re.escape(msg)):
         imputer.fit(df_na)
-    # check that error message matches
-    assert str(record.value) == msg
 
-    msg = "The variable(s) Name contain(s) multiple frequent categories."
+    msg = (
+        "The variable(s) Name contain(s) multiple frequent categories. "
+        "Set errors='warn' or errors='ignore' to allow imputation "
+        "using the first most frequent category found."
+    )
     imputer = CategoricalImputer(imputation_method="frequent")
-    with pytest.raises(ValueError) as record:
+    with pytest.raises(ValueError, match=re.escape(msg)):
         imputer.fit(df_na)
-    # check that error message matches
-    assert str(record.value) == msg
 
     df_ = df_na.copy()
     df_["Name_dup"] = df_["Name"]
-    msg = "The variable(s) Name, Name_dup contain(s) multiple frequent categories."
+    msg = (
+        "The variable(s) Name, Name_dup contain(s) multiple frequent categories. "
+        "Set errors='warn' or errors='ignore' to allow imputation "
+        "using the first most frequent category found."
+    )
     imputer = CategoricalImputer(imputation_method="frequent")
-    with pytest.raises(ValueError) as record:
+    with pytest.raises(ValueError, match=re.escape(msg)):
         imputer.fit(df_)
-    # check that error message matches
-    assert str(record.value) == msg
 
 
 def test_impute_numerical_variables(df_na):
@@ -300,8 +326,76 @@ def test_variables_cast_as_category_frequent(df_na):
 )
 def test_error_when_ignore_format_is_not_boolean(ignore_format):
     msg = "ignore_format takes only booleans True and False"
-    with pytest.raises(ValueError) as record:
+    with pytest.raises(ValueError, match=msg):
         CategoricalImputer(imputation_method="missing", ignore_format=ignore_format)
 
-    # check that error message matches
-    assert str(record.value) == msg
+
+def test_multimodal_raises_errors(multimodal_df):
+    imputer = CategoricalImputer(imputation_method="frequent")
+    msg = (
+        "The variable(s) city, country contain(s) multiple frequent categories. "
+        "Set errors='warn' or errors='ignore' to allow imputation "
+        "using the first most frequent category found."
+    )
+    with pytest.raises(ValueError, match=re.escape(msg)):
+        imputer.fit(multimodal_df)
+
+
+@pytest.mark.parametrize("errors", ["warn", "ignore"])
+def test_multimodal_imputation_result(multimodal_df, errors):
+    """Check that result is the same when errors='warn' or 'ignore'."""
+    imputer = CategoricalImputer(imputation_method="frequent", errors=errors)
+    if errors == "warn":
+        with pytest.warns(UserWarning, match="multiple frequent categories"):
+            imputer.fit(multimodal_df)
+    else:
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            imputer.fit(multimodal_df)
+            # Check that no warnings with the specific message were raised
+            matching_warnings = [
+                msg for msg in w if "multiple frequent categories" in str(msg.message)
+            ]
+            assert len(matching_warnings) == 0
+
+
+@pytest.mark.parametrize("errors", ["bad_value", 1, True])
+def test_errors_invalid_value_raises(errors):
+    """Passing an unsupported value for errors should raise ValueError at init."""
+    with pytest.raises(ValueError, match="errors takes only values"):
+        CategoricalImputer(imputation_method="frequent", errors=errors)
+
+
+def test_errors_param_ignored_when_imputation_method_is_missing():
+    """errors param has no effect for imputation_method='missing'."""
+    df = pd.DataFrame({"city": ["London", np.nan, "Paris"]})
+    imputer = CategoricalImputer(imputation_method="missing", errors="warn")
+    # Should fit without warnings since there's no mode computation
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        imputer.fit(df)
+        matching_warnings = [
+            msg for msg in w if "multiple frequent categories" in str(msg.message)
+        ]
+        assert len(matching_warnings) == 0
+
+
+def test_warning_when_single_variable_is_multimodal(multimodal_df):
+    imputer = CategoricalImputer(
+        imputation_method="frequent", variables="city", errors="warn"
+    )
+    with pytest.warns(UserWarning, match="multiple frequent categories"):
+        imputer.fit(multimodal_df)
+    assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0]
+
+
+def test_errors_raise_when_only_one_variable_is_multimodal(multimodal_df):
+    """
+    This branch is reached when multiple variables are selected but only ONE of them
+    turns out to have multiple modes.
+    """
+    imputer = CategoricalImputer(
+        imputation_method="frequent", variables=["city", "one_mode"], errors="raise"
+    )
+    with pytest.raises(ValueError, match="city"):
+        imputer.fit(multimodal_df)
-Original file line number
+Diff line change
@@ Expand Up / @@ -86,6 +86,7 @@ celerybeat-schedule @@
     # Environments
     .env
     .venv
+    .venv_wsl
     env/
     venv/
     ENV/
@@ Expand Down @@