Skip to content
Open
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
fb230fe
feat(CategoricalImputer): add errors param to handle multimodal varia…
direkkakkar319-ops Mar 8, 2026
81be348
style: fix flake8 line length in CategoricalImputer
direkkakkar319-ops Mar 8, 2026
4fb5b7a
style: fix import order and duplicate pandas import
direkkakkar319-ops Mar 8, 2026
835133f
test: add coverage for errors='ignore' branches
direkkakkar319-ops Mar 8, 2026
81f31d8
style: add missing newline at end of test file
direkkakkar319-ops Mar 8, 2026
657de1f
Changes for codedev tests
direkkakkar319-ops Mar 9, 2026
a0ea71d
added space at last of test_categorical_imputer.py
direkkakkar319-ops Mar 16, 2026
0cdcf03
Revert docs/whats_new/v_190.rst to upstream version
direkkakkar319-ops Mar 26, 2026
cf7670e
changes done to `feature_engine/imputation/categorical.py`
direkkakkar319-ops Mar 26, 2026
fb2f8db
changes made to `tests/test_imputation/test_categorical_imputer.py`
direkkakkar319-ops Mar 26, 2026
97d6053
resolved comment done on R15
direkkakkar319-ops Mar 26, 2026
c454edd
reformatted the error tests to match the error from within pytest
direkkakkar319-ops Mar 26, 2026
5992d09
made three tests into one test
direkkakkar319-ops Mar 26, 2026
85b1974
left change
direkkakkar319-ops Mar 26, 2026
09429f3
refactored the multimodal tests
direkkakkar319-ops Mar 26, 2026
0b86cfa
refactored test_errors_invalid_value_raises
direkkakkar319-ops Mar 26, 2026
45f4e2f
changed the function `test_errors_param_ignored_when_imputation_metho…
direkkakkar319-ops Mar 26, 2026
cda93e7
removed `test_errors_ignore_single_variable` `test_errors_ignore_mult…
direkkakkar319-ops Mar 26, 2026
04be1a0
remove the commented block
direkkakkar319-ops Mar 26, 2026
94643d8
last few changes made
direkkakkar319-ops Mar 26, 2026
ab6ba66
test case style updated
direkkakkar319-ops Mar 26, 2026
6ba7fce
Renamed `errors` to `multimodal` in CategoricalImputer and add missin…
direkkakkar319-ops Mar 27, 2026
1a3fde2
Apply suggestion from @solegalli
direkkakkar319-ops Mar 27, 2026
36eb1dc
Apply suggestion from @solegalli
direkkakkar319-ops Mar 27, 2026
aa37d19
Update categorical.py
direkkakkar319-ops Mar 27, 2026
3e58d8b
removed comments and added tests
direkkakkar319-ops Mar 27, 2026
6746429
Merge branch 'issue-904-categorical-imputer-multimodal' of https://gi…
direkkakkar319-ops Mar 27, 2026
c77e8f1
Update .gitignore
direkkakkar319-ops Mar 27, 2026
a22f586
removed the spaces
direkkakkar319-ops Mar 27, 2026
51f8276
Merge branch 'issue-904-categorical-imputer-multimodal' of https://gi…
direkkakkar319-ops Mar 27, 2026
7156d28
removed the spaces
direkkakkar319-ops Mar 27, 2026
5d65fe8
simplified the test case as asked
direkkakkar319-ops Mar 27, 2026
a95f5e0
simplified the test case as asked
direkkakkar319-ops Mar 27, 2026
6f5b4da
simplified the test case as asked
direkkakkar319-ops Mar 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ celerybeat-schedule
# Environments
.env
.venv
.venv_wsl
env/
venv/
ENV/
Expand Down
68 changes: 55 additions & 13 deletions feature_engine/imputation/categorical.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Authors: Soledad Galli <solegalli@protonmail.com>
# License: BSD 3 clause

import warnings
from typing import List, Optional, Union

import pandas as pd
Expand All @@ -12,12 +13,10 @@
_feature_names_in_docstring,
_imputer_dict_docstring,
_n_features_in_docstring,
_variables_attribute_docstring,
)
from feature_engine._docstrings.methods import (
_fit_transform_docstring,
_transform_imputers_docstring,
_variables_attribute_docstring
)
from feature_engine._docstrings.methods import (_fit_transform_docstring,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please make format match other imports

_transform_imputers_docstring)
from feature_engine._docstrings.substitute import Substitution
from feature_engine.dataframe_checks import check_X
from feature_engine.imputation.base_imputer import BaseImputer
Expand All @@ -26,7 +25,7 @@
check_all_variables,
check_categorical_variables,
find_all_variables,
find_categorical_variables,
find_categorical_variables
Comment thread
direkkakkar319-ops marked this conversation as resolved.
Outdated
)


Expand Down Expand Up @@ -88,6 +87,18 @@ class CategoricalImputer(BaseImputer):
type object or categorical. If True, the imputer will select all variables or
accept all variables entered by the user, including those cast as numeric.

multimodal : str, default='raise'
Indicates what to do when imputation_method='frequent'
and a variable has more than 1 mode.

If 'raise', raises a ValueError and stops the fit.

If 'warn', raises a UserWarning and continues, imputing using the
Comment thread
direkkakkar319-ops marked this conversation as resolved.
Outdated
first most frequent category found.

If 'ignore', continues without warnings, imputing using the first
most frequent category found.

Attributes
----------
{imputer_dict_}
Expand Down Expand Up @@ -135,6 +146,7 @@ def __init__(
variables: Union[None, int, str, List[Union[str, int]]] = None,
return_object: bool = False,
ignore_format: bool = False,
errors: str = "raise",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
errors: str = "raise",
multimodal: str = "raise",

) -> None:
if imputation_method not in ["missing", "frequent"]:
raise ValueError(
Expand All @@ -144,11 +156,18 @@ def __init__(
if not isinstance(ignore_format, bool):
raise ValueError("ignore_format takes only booleans True and False")

if errors not in ["raise", "warn", "ignore"]:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need to update all of these errors to multimodal :)

raise ValueError(
"errors takes only values 'raise', 'warn', or 'ignore'. "
f"Got {errors} instead."
)

self.imputation_method = imputation_method
self.fill_value = fill_value
self.variables = _check_variables_input_value(variables)
self.return_object = return_object
self.ignore_format = ignore_format
self.errors = errors

def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
"""
Expand Down Expand Up @@ -189,9 +208,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):

# Some variables may contain more than 1 mode:
if len(mode_vals) > 1:
raise ValueError(
f"The variable {var} contains multiple frequent categories."
)
if self.errors == "raise":
raise ValueError(
f"The variable {var} contains multiple "
f"frequent categories. Set errors='warn' or "
f"errors='ignore' to allow imputation using "
f"the first most frequent category found."
)
elif self.errors == "warn":
warnings.warn(
f"Variable {var} has multiple frequent "
f"categories. The first category found, "
f"{mode_vals[0]}, will be used for imputation.",
UserWarning,
)

self.imputer_dict_ = {var: mode_vals[0]}

Expand All @@ -208,10 +238,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
varnames_str = ", ".join(varnames)
else:
varnames_str = varnames[0]
raise ValueError(
f"The variable(s) {varnames_str} contain(s) multiple frequent "
f"categories."
)

if self.errors == "raise":
raise ValueError(
f"The variable(s) {varnames_str} contain(s) "
f"multiple frequent categories. Set "
f"errors='warn' or errors='ignore' to allow "
f"imputation using the first most frequent "
f"category found."
)
elif self.errors == "warn":
warnings.warn(
f"Variable(s) {varnames_str} have multiple "
f"frequent categories. The first category "
f"found will be used for imputation.",
UserWarning,
)

self.imputer_dict_ = mode_vals.iloc[0].to_dict()

Expand Down
126 changes: 110 additions & 16 deletions tests/test_imputation/test_categorical_imputer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,28 @@
import re
import warnings

import numpy as np
import pandas as pd
import pytest

from feature_engine.imputation import CategoricalImputer


@pytest.fixture
def multimodal_df():
    """Return a small frame with two multimodal columns and one unimodal one.

    ``city`` and ``country`` each have three values tied at two occurrences,
    while ``one_mode`` has a single most frequent value ("London", three times).
    """
    cities = ["London", "London", "Paris", "Paris", "Berlin", "Berlin", "Madrid"]
    countries = ["UK", "UK", "FR", "FR", "DE", "DE", "ES"]
    single_mode = [
        "London", "London", "London", "Paris", "Paris", "Berlin", "Berlin"
    ]
    return pd.DataFrame(
        {"city": cities, "country": countries, "one_mode": single_mode}
    )


def test_impute_with_string_missing_and_automatically_find_variables(df_na):
# set up transformer
imputer = CategoricalImputer(imputation_method="missing", variables=None)
Expand Down Expand Up @@ -145,33 +164,40 @@ def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_object(df_n


def test_error_when_imputation_method_not_frequent_or_missing():
with pytest.raises(ValueError):
msg = "imputation_method takes only values 'missing' or 'frequent'"
with pytest.raises(ValueError, match=msg):
CategoricalImputer(imputation_method="arbitrary")


def test_error_when_variable_contains_multiple_modes(df_na):
msg = "The variable Name contains multiple frequent categories."
msg = (
Comment thread
direkkakkar319-ops marked this conversation as resolved.
Outdated
"The variable Name contains multiple frequent categories. "
"Set errors='warn' or errors='ignore' to allow imputation "
"using the first most frequent category found."
)
imputer = CategoricalImputer(imputation_method="frequent", variables="Name")
with pytest.raises(ValueError) as record:
with pytest.raises(ValueError, match=re.escape(msg)):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of making the test now dependent on re, we can test that it matches just part of the error message, 1 line, that's all we need :)

imputer.fit(df_na)
# check that error message matches
assert str(record.value) == msg

msg = "The variable(s) Name contain(s) multiple frequent categories."
msg = (
"The variable(s) Name contain(s) multiple frequent categories. "
Comment thread
direkkakkar319-ops marked this conversation as resolved.
Outdated
"Set errors='warn' or errors='ignore' to allow imputation "
"using the first most frequent category found."
)
imputer = CategoricalImputer(imputation_method="frequent")
with pytest.raises(ValueError) as record:
with pytest.raises(ValueError, match=re.escape(msg)):
imputer.fit(df_na)
# check that error message matches
assert str(record.value) == msg

df_ = df_na.copy()
df_["Name_dup"] = df_["Name"]
msg = "The variable(s) Name, Name_dup contain(s) multiple frequent categories."
msg = (
"The variable(s) Name, Name_dup contain(s) multiple frequent categories. "
"Set errors='warn' or errors='ignore' to allow imputation "
"using the first most frequent category found."
)
imputer = CategoricalImputer(imputation_method="frequent")
with pytest.raises(ValueError) as record:
with pytest.raises(ValueError, match=re.escape(msg)):
imputer.fit(df_)
# check that error message matches
assert str(record.value) == msg


def test_impute_numerical_variables(df_na):
Expand Down Expand Up @@ -300,8 +326,76 @@ def test_variables_cast_as_category_frequent(df_na):
)
def test_error_when_ignore_format_is_not_boolean(ignore_format):
msg = "ignore_format takes only booleans True and False"
with pytest.raises(ValueError) as record:
with pytest.raises(ValueError, match=msg):
CategoricalImputer(imputation_method="missing", ignore_format=ignore_format)

# check that error message matches
assert str(record.value) == msg

def test_multimodal_raises_errors(multimodal_df):
imputer = CategoricalImputer(imputation_method="frequent")
msg = (
"The variable(s) city, country contain(s) multiple frequent categories. "
"Set errors='warn' or errors='ignore' to allow imputation "
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can remove the 2nd and 3rd line and then we don't need re

"using the first most frequent category found."
)
with pytest.raises(ValueError, match=re.escape(msg)):
imputer.fit(multimodal_df)


@pytest.mark.parametrize("errors", ["warn", "ignore"])
def test_multimodal_imputation_result(multimodal_df, errors):
"""Check that result is the same when errors='warn' or 'ignore'."""
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pls remove comment

imputer = CategoricalImputer(imputation_method="frequent", errors=errors)
if errors == "warn":
with pytest.warns(UserWarning, match="multiple frequent categories"):
imputer.fit(multimodal_df)
else:
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
imputer.fit(multimodal_df)
# Check that no warnings with the specific message were raised
matching_warnings = [
msg for msg in w if "multiple frequent categories" in str(msg.message)
]
assert len(matching_warnings) == 0


@pytest.mark.parametrize("errors", ["bad_value", 1, True])
def test_errors_invalid_value_raises(errors):
"""Passing an unsupported value for errors should raise ValueError at init."""
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pls remove comments from all tests :)

with pytest.raises(ValueError, match="errors takes only values"):
CategoricalImputer(imputation_method="frequent", errors=errors)


def test_errors_param_ignored_when_imputation_method_is_missing():
    """Fitting with imputation_method='missing' and errors='warn' must not
    emit any "multiple frequent categories" warning."""
    data = pd.DataFrame({"city": ["London", np.nan, "Paris"]})
    imputer = CategoricalImputer(imputation_method="missing", errors="warn")

    # Record every warning issued during fit so it can be inspected afterwards.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        imputer.fit(data)

    assert not any(
        "multiple frequent categories" in str(item.message) for item in caught
    )


def test_warning_when_single_variable_is_multimodal(multimodal_df):
    """A single multimodal variable with errors='warn' warns and is imputed
    with the first mode reported by pandas."""
    expected_fill = multimodal_df["city"].mode()[0]
    imputer = CategoricalImputer(
        imputation_method="frequent", variables="city", errors="warn"
    )

    with pytest.warns(UserWarning, match="multiple frequent categories"):
        imputer.fit(multimodal_df)

    assert imputer.imputer_dict_["city"] == expected_fill


def test_errors_raise_when_only_one_variable_is_multimodal(multimodal_df):
    """
    Several variables are selected, yet just one of them ("city") has more
    than one mode; fitting with errors="raise" must fail and mention it.
    """
    selected = ["city", "one_mode"]
    imputer = CategoricalImputer(
        imputation_method="frequent", variables=selected, errors="raise"
    )

    with pytest.raises(ValueError, match="city"):
        imputer.fit(multimodal_df)