-
Notifications
You must be signed in to change notification settings - Fork 345
feat: add errors parameter to CategoricalImputer for multimodal variables
#908
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 21 commits
fb230fe
81be348
4fb5b7a
835133f
81f31d8
657de1f
a0ea71d
0cdcf03
cf7670e
fb2f8db
97d6053
c454edd
5992d09
85b1974
09429f3
0b86cfa
45f4e2f
cda93e7
04be1a0
94643d8
ab6ba66
6ba7fce
1a3fde2
36eb1dc
aa37d19
3e58d8b
6746429
c77e8f1
a22f586
51f8276
7156d28
5d65fe8
a95f5e0
6f5b4da
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -86,6 +86,7 @@ celerybeat-schedule | |
| # Environments | ||
| .env | ||
| .venv | ||
| .venv_wsl | ||
| env/ | ||
| venv/ | ||
| ENV/ | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,6 +1,7 @@ | ||||||
| # Authors: Soledad Galli <solegalli@protonmail.com> | ||||||
| # License: BSD 3 clause | ||||||
|
|
||||||
| import warnings | ||||||
| from typing import List, Optional, Union | ||||||
|
|
||||||
| import pandas as pd | ||||||
|
|
@@ -12,12 +13,10 @@ | |||||
| _feature_names_in_docstring, | ||||||
| _imputer_dict_docstring, | ||||||
| _n_features_in_docstring, | ||||||
| _variables_attribute_docstring, | ||||||
| ) | ||||||
| from feature_engine._docstrings.methods import ( | ||||||
| _fit_transform_docstring, | ||||||
| _transform_imputers_docstring, | ||||||
| _variables_attribute_docstring | ||||||
| ) | ||||||
| from feature_engine._docstrings.methods import (_fit_transform_docstring, | ||||||
| _transform_imputers_docstring) | ||||||
| from feature_engine._docstrings.substitute import Substitution | ||||||
| from feature_engine.dataframe_checks import check_X | ||||||
| from feature_engine.imputation.base_imputer import BaseImputer | ||||||
|
|
@@ -26,7 +25,7 @@ | |||||
| check_all_variables, | ||||||
| check_categorical_variables, | ||||||
| find_all_variables, | ||||||
| find_categorical_variables, | ||||||
| find_categorical_variables | ||||||
|
direkkakkar319-ops marked this conversation as resolved.
Outdated
|
||||||
| ) | ||||||
|
|
||||||
|
|
||||||
|
|
@@ -88,6 +87,18 @@ class CategoricalImputer(BaseImputer): | |||||
| type object or categorical. If True, the imputer will select all variables or | ||||||
| accept all variables entered by the user, including those cast as numeric. | ||||||
|
|
||||||
| multimodal : str, default='raise' | ||||||
| Indicates what to do when imputation_method='frequent' | ||||||
| and a variable has more than 1 mode. | ||||||
|
|
||||||
| If 'raise', raises a ValueError and stops the fit. | ||||||
|
|
||||||
| If 'warn', raises a UserWarning and continues, imputing using the | ||||||
|
direkkakkar319-ops marked this conversation as resolved.
Outdated
|
||||||
| first most frequent category found. | ||||||
|
|
||||||
| If 'ignore', continues without warnings, imputing using the first | ||||||
| most frequent category found. | ||||||
|
|
||||||
| Attributes | ||||||
| ---------- | ||||||
| {imputer_dict_} | ||||||
|
|
@@ -135,6 +146,7 @@ def __init__( | |||||
| variables: Union[None, int, str, List[Union[str, int]]] = None, | ||||||
| return_object: bool = False, | ||||||
| ignore_format: bool = False, | ||||||
| errors: str = "raise", | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| ) -> None: | ||||||
| if imputation_method not in ["missing", "frequent"]: | ||||||
| raise ValueError( | ||||||
|
|
@@ -144,11 +156,18 @@ def __init__( | |||||
| if not isinstance(ignore_format, bool): | ||||||
| raise ValueError("ignore_format takes only booleans True and False") | ||||||
|
|
||||||
| if errors not in ["raise", "warn", "ignore"]: | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we need to update all of these errors to multimodal :) |
||||||
| raise ValueError( | ||||||
| "errors takes only values 'raise', 'warn', or 'ignore'. " | ||||||
| f"Got {errors} instead." | ||||||
| ) | ||||||
|
|
||||||
| self.imputation_method = imputation_method | ||||||
| self.fill_value = fill_value | ||||||
| self.variables = _check_variables_input_value(variables) | ||||||
| self.return_object = return_object | ||||||
| self.ignore_format = ignore_format | ||||||
| self.errors = errors | ||||||
|
|
||||||
| def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): | ||||||
| """ | ||||||
|
|
@@ -189,9 +208,20 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): | |||||
|
|
||||||
| # Some variables may contain more than 1 mode: | ||||||
| if len(mode_vals) > 1: | ||||||
| raise ValueError( | ||||||
| f"The variable {var} contains multiple frequent categories." | ||||||
| ) | ||||||
| if self.errors == "raise": | ||||||
| raise ValueError( | ||||||
| f"The variable {var} contains multiple " | ||||||
| f"frequent categories. Set errors='warn' or " | ||||||
| f"errors='ignore' to allow imputation using " | ||||||
| f"the first most frequent category found." | ||||||
| ) | ||||||
| elif self.errors == "warn": | ||||||
| warnings.warn( | ||||||
| f"Variable {var} has multiple frequent " | ||||||
| f"categories. The first category found, " | ||||||
| f"{mode_vals[0]}, will be used for imputation.", | ||||||
| UserWarning, | ||||||
| ) | ||||||
|
|
||||||
| self.imputer_dict_ = {var: mode_vals[0]} | ||||||
|
|
||||||
|
|
@@ -208,10 +238,22 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): | |||||
| varnames_str = ", ".join(varnames) | ||||||
| else: | ||||||
| varnames_str = varnames[0] | ||||||
| raise ValueError( | ||||||
| f"The variable(s) {varnames_str} contain(s) multiple frequent " | ||||||
| f"categories." | ||||||
| ) | ||||||
|
|
||||||
| if self.errors == "raise": | ||||||
| raise ValueError( | ||||||
| f"The variable(s) {varnames_str} contain(s) " | ||||||
| f"multiple frequent categories. Set " | ||||||
| f"errors='warn' or errors='ignore' to allow " | ||||||
| f"imputation using the first most frequent " | ||||||
| f"category found." | ||||||
| ) | ||||||
| elif self.errors == "warn": | ||||||
| warnings.warn( | ||||||
| f"Variable(s) {varnames_str} have multiple " | ||||||
| f"frequent categories. The first category " | ||||||
| f"found will be used for imputation.", | ||||||
| UserWarning, | ||||||
| ) | ||||||
|
|
||||||
| self.imputer_dict_ = mode_vals.iloc[0].to_dict() | ||||||
|
|
||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,9 +1,28 @@ | ||
| import re | ||
| import warnings | ||
|
|
||
| import numpy as np | ||
| import pandas as pd | ||
| import pytest | ||
|
|
||
| from feature_engine.imputation import CategoricalImputer | ||
|
|
||
|
|
||
| @pytest.fixture | ||
| def multimodal_df(): | ||
| return pd.DataFrame( | ||
| { | ||
| "city": [ | ||
| "London", "London", "Paris", "Paris", "Berlin", "Berlin", "Madrid" | ||
| ], | ||
| "country": ["UK", "UK", "FR", "FR", "DE", "DE", "ES"], | ||
| "one_mode": [ | ||
| "London", "London", "London", "Paris", "Paris", "Berlin", "Berlin" | ||
| ], | ||
| } | ||
| ) | ||
|
|
||
|
|
||
| def test_impute_with_string_missing_and_automatically_find_variables(df_na): | ||
| # set up transformer | ||
| imputer = CategoricalImputer(imputation_method="missing", variables=None) | ||
|
|
@@ -145,33 +164,40 @@ def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_object(df_n | |
|
|
||
|
|
||
| def test_error_when_imputation_method_not_frequent_or_missing(): | ||
| with pytest.raises(ValueError): | ||
| msg = "imputation_method takes only values 'missing' or 'frequent'" | ||
| with pytest.raises(ValueError, match=msg): | ||
| CategoricalImputer(imputation_method="arbitrary") | ||
|
|
||
|
|
||
| def test_error_when_variable_contains_multiple_modes(df_na): | ||
| msg = "The variable Name contains multiple frequent categories." | ||
| msg = ( | ||
|
direkkakkar319-ops marked this conversation as resolved.
Outdated
|
||
| "The variable Name contains multiple frequent categories. " | ||
| "Set errors='warn' or errors='ignore' to allow imputation " | ||
| "using the first most frequent category found." | ||
| ) | ||
| imputer = CategoricalImputer(imputation_method="frequent", variables="Name") | ||
| with pytest.raises(ValueError) as record: | ||
| with pytest.raises(ValueError, match=re.escape(msg)): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. instead of making the test now dependent on re, we can test that it matches just part of the error message, 1 line, that's all we need :) |
||
| imputer.fit(df_na) | ||
| # check that error message matches | ||
| assert str(record.value) == msg | ||
|
|
||
| msg = "The variable(s) Name contain(s) multiple frequent categories." | ||
| msg = ( | ||
| "The variable(s) Name contain(s) multiple frequent categories. " | ||
|
direkkakkar319-ops marked this conversation as resolved.
Outdated
|
||
| "Set errors='warn' or errors='ignore' to allow imputation " | ||
| "using the first most frequent category found." | ||
| ) | ||
| imputer = CategoricalImputer(imputation_method="frequent") | ||
| with pytest.raises(ValueError) as record: | ||
| with pytest.raises(ValueError, match=re.escape(msg)): | ||
| imputer.fit(df_na) | ||
| # check that error message matches | ||
| assert str(record.value) == msg | ||
|
|
||
| df_ = df_na.copy() | ||
| df_["Name_dup"] = df_["Name"] | ||
| msg = "The variable(s) Name, Name_dup contain(s) multiple frequent categories." | ||
| msg = ( | ||
| "The variable(s) Name, Name_dup contain(s) multiple frequent categories. " | ||
| "Set errors='warn' or errors='ignore' to allow imputation " | ||
| "using the first most frequent category found." | ||
| ) | ||
| imputer = CategoricalImputer(imputation_method="frequent") | ||
| with pytest.raises(ValueError) as record: | ||
| with pytest.raises(ValueError, match=re.escape(msg)): | ||
| imputer.fit(df_) | ||
| # check that error message matches | ||
| assert str(record.value) == msg | ||
|
|
||
|
|
||
| def test_impute_numerical_variables(df_na): | ||
|
|
@@ -300,8 +326,76 @@ def test_variables_cast_as_category_frequent(df_na): | |
| ) | ||
| def test_error_when_ignore_format_is_not_boolean(ignore_format): | ||
| msg = "ignore_format takes only booleans True and False" | ||
| with pytest.raises(ValueError) as record: | ||
| with pytest.raises(ValueError, match=msg): | ||
| CategoricalImputer(imputation_method="missing", ignore_format=ignore_format) | ||
|
|
||
| # check that error message matches | ||
| assert str(record.value) == msg | ||
|
|
||
| def test_multimodal_raises_errors(multimodal_df): | ||
| imputer = CategoricalImputer(imputation_method="frequent") | ||
| msg = ( | ||
| "The variable(s) city, country contain(s) multiple frequent categories. " | ||
| "Set errors='warn' or errors='ignore' to allow imputation " | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we can remove the 2nd and 3rd line and then we don't need re |
||
| "using the first most frequent category found." | ||
| ) | ||
| with pytest.raises(ValueError, match=re.escape(msg)): | ||
| imputer.fit(multimodal_df) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("errors", ["warn", "ignore"]) | ||
| def test_multimodal_imputation_result(multimodal_df, errors): | ||
| """Check that result is the same when errors='warn' or 'ignore'.""" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pls remove comment |
||
| imputer = CategoricalImputer(imputation_method="frequent", errors=errors) | ||
| if errors == "warn": | ||
| with pytest.warns(UserWarning, match="multiple frequent categories"): | ||
| imputer.fit(multimodal_df) | ||
| else: | ||
| with warnings.catch_warnings(record=True) as w: | ||
| warnings.simplefilter("always") | ||
| imputer.fit(multimodal_df) | ||
| # Check that no warnings with the specific message were raised | ||
| matching_warnings = [ | ||
| msg for msg in w if "multiple frequent categories" in str(msg.message) | ||
| ] | ||
| assert len(matching_warnings) == 0 | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("errors", ["bad_value", 1, True]) | ||
| def test_errors_invalid_value_raises(errors): | ||
| """Passing an unsupported value for errors should raise ValueError at init.""" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pls remove comments from all tests :) |
||
| with pytest.raises(ValueError, match="errors takes only values"): | ||
| CategoricalImputer(imputation_method="frequent", errors=errors) | ||
|
|
||
|
|
||
| def test_errors_param_ignored_when_imputation_method_is_missing(): | ||
| """errors param has no effect for imputation_method='missing'.""" | ||
| df = pd.DataFrame({"city": ["London", np.nan, "Paris"]}) | ||
| imputer = CategoricalImputer(imputation_method="missing", errors="warn") | ||
| # Should fit without warnings since there's no mode computation | ||
| with warnings.catch_warnings(record=True) as w: | ||
| warnings.simplefilter("always") | ||
| imputer.fit(df) | ||
| matching_warnings = [ | ||
| msg for msg in w if "multiple frequent categories" in str(msg.message) | ||
| ] | ||
| assert len(matching_warnings) == 0 | ||
|
|
||
|
|
||
| def test_warning_when_single_variable_is_multimodal(multimodal_df): | ||
| imputer = CategoricalImputer( | ||
| imputation_method="frequent", variables="city", errors="warn" | ||
| ) | ||
| with pytest.warns(UserWarning, match="multiple frequent categories"): | ||
| imputer.fit(multimodal_df) | ||
| assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] | ||
|
|
||
|
|
||
| def test_errors_raise_when_only_one_variable_is_multimodal(multimodal_df): | ||
| """ | ||
| This branch is reached when multiple variables are selected but only ONE of them | ||
| turns out to have multiple modes. | ||
| """ | ||
| imputer = CategoricalImputer( | ||
| imputation_method="frequent", variables=["city", "one_mode"], errors="raise" | ||
| ) | ||
| with pytest.raises(ValueError, match="city"): | ||
| imputer.fit(multimodal_df) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please make format match other imports