Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 33 additions & 5 deletions tools/import_validation/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ class ValidationRunner:
"""

def __init__(self, validation_config_path: str, differ_output: str,
stats_summary: str, lint_report: str, validation_output: str):
stats_summary: str, lint_report: str, validation_output: str,
counters_report: str = None):
self.config = ValidationConfig(validation_config_path)
self.validation_output = validation_output
self.validator = Validator()
Expand All @@ -49,7 +50,8 @@ def __init__(self, validation_config_path: str, differ_output: str,
'stats': pd.DataFrame(),
'differ': pd.DataFrame(),
'differ_summary': {},
'lint': {}
'lint': {},
'counters': {}
}

self.validation_dispatch = {
Expand Down Expand Up @@ -85,12 +87,22 @@ def __init__(self, validation_config_path: str, differ_output: str,
'MAX_VALUE_CHECK':
(self.validator.validate_max_value_check, 'stats'),
'GOLDENS_CHECK': (self.validator.validate_goldens, 'stats'),
'COUNTER_ZERO_CHECK':
(self.validator.validate_counter_zero, 'counters'),
'COUNTER_MAX_THRESHOLD':
(self.validator.validate_counter_max_threshold, 'counters'),
'COUNTER_RATIO_THRESHOLD':
(self.validator.validate_counter_ratio_threshold, 'counters'),
'COUNTER_SUM_INTEGRITY':
(self.validator.validate_counter_sum_integrity, 'counters'),
'COUNTER_MIN_YIELD':
(self.validator.validate_counter_min_yield, 'counters'),
}

self._initialize_data_sources(stats_summary, lint_report, differ_output)
self._initialize_data_sources(stats_summary, lint_report, differ_output, counters_report)

def _initialize_data_sources(self, stats_summary: str, lint_report: str,
differ_output: str):
differ_output: str, counters_report: str = None):
"""
Checks for and loads the required data sources based on the config.
"""
Expand Down Expand Up @@ -167,6 +179,19 @@ def _initialize_data_sources(self, stats_summary: str, lint_report: str,
logging.warning("lint_report file exists but is empty: %s",
lint_report)

if counters_report and os.path.exists(counters_report) and os.path.getsize(
counters_report) > 0:
try:
with open(counters_report, 'r') as f:
self.data_sources['counters'] = json.load(f)
except Exception as e:
logging.error(
f"JSON parse error while reading counters report at {counters_report}: {e}"
)
Comment on lines +189 to +208

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

It is recommended to raise an exception if the counters report fails to parse, rather than just logging an error. Continuing with an empty counters dictionary could lead to incorrect validation results (e.g., COUNTER_ZERO_CHECK passing when it should have failed because the data was missing). Raising a ValueError ensures the user is aware that the validation process is incomplete due to corrupted input data, enforcing the correctness of the validation process.

Suggested change
try:
with open(counters_report, 'r') as f:
self.data_sources['counters'] = json.load(f)
except Exception as e:
logging.error(
f"JSON parse error while reading counters report at {counters_report}: {e}"
)
try:
with open(counters_report, 'r') as f:
self.data_sources['counters'] = json.load(f)
except (json.JSONDecodeError, OSError) as e:
raise ValueError(
f"Failed to parse counters report at {counters_report}: {e}"
) from e
References
  1. When a configuration dictionary like resource_limits is provided, it is acceptable to assume it contains all required keys and allow it to fail on a KeyError if a key is missing, rather than defensively using default values. This enforces configuration correctness.

Comment on lines +205 to +208

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Avoid catching broad exceptions. It is better to catch specific errors that pd.read_csv might raise, such as pd.errors.ParserError or pd.errors.EmptyDataError.

Suggested change
except Exception as e:
logging.error(
f"CSV parse error while reading counters report at {counters_report}: {e}"
)
except pd.errors.ParserError as e:
logging.error(
f"CSV parse error while reading counters report at {counters_report}: {e}"
)

elif counters_report and os.path.exists(counters_report):
logging.warning("counters_report file exists but is empty: %s",
counters_report)

def _determine_required_sources(self) -> set[str]:
"""
Parses the validation config to determine which data sources are required.
Expand Down Expand Up @@ -265,7 +290,8 @@ def main(_):
differ_output=_FLAGS.differ_output,
stats_summary=_FLAGS.stats_summary,
lint_report=_FLAGS.lint_report,
validation_output=_FLAGS.validation_output)
validation_output=_FLAGS.validation_output,
counters_report=_FLAGS.counters_report)
overall_status, _ = runner.run_validations()
if not overall_status:
sys.exit(1)
Expand All @@ -283,6 +309,8 @@ def main(_):
'Path to the stats summary report file.')
flags.DEFINE_string('lint_report', None,
'Path to the mcf lint report file.')
flags.DEFINE_string('counters_report', None,
'Path to the counters report file.')
flags.DEFINE_string('validation_output', None,
'Path to the validation output file.')
flags.mark_flag_as_required('validation_output')
Expand Down
54 changes: 54 additions & 0 deletions tools/import_validation/runner_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,3 +350,57 @@ def test_init_raises_error_if_required_file_is_missing(self):
lint_report=self.report_path,
validation_output=self.output_path)
self.assertIn("'stats' data source", str(context.exception))


class TestCountersIntegration(unittest.TestCase):
'''Test Class for counter validations integration in ValidationRunner.'''

def setUp(self):
self.test_dir = tempfile.TemporaryDirectory()
self.config_path = os.path.join(self.test_dir.name, 'config.json')
self.stats_path = os.path.join(self.test_dir.name, 'stats.csv')
self.report_path = os.path.join(self.test_dir.name, 'report.json')
self.differ_path = os.path.join(self.test_dir.name, 'differ.csv')
self.output_path = os.path.join(self.test_dir.name, 'output.csv')
self.counters_path = os.path.join(self.test_dir.name, 'counters.json')

def tearDown(self):
self.test_dir.cleanup()

@patch('tools.import_validation.runner.Validator')
def test_runner_loads_counters_and_calls_validator(self, MockValidator):
# 1. Setup the mock
mock_validator_instance = MockValidator.return_value
mock_validator_instance.validate_counter_zero.return_value = ValidationResult(
ValidationStatus.PASSED, 'COUNTER_ZERO_CHECK')

# 2. Create test files
with open(self.config_path, 'w') as f:
json.dump(
{
'rules': [{
'rule_id': 'check_zero_errors',
'validator': 'COUNTER_ZERO_CHECK',
'params': {'counter_name': 'invalid-lat-lng'}
}]
}, f)

counters_data = {'invalid-lat-lng': 0}
with open(self.counters_path, 'w') as f:
json.dump(counters_data, f)

# 3. Run the runner
runner = ValidationRunner(
validation_config_path=self.config_path,
stats_summary=self.stats_path,
differ_output=self.differ_path,
lint_report=self.report_path,
validation_output=self.output_path,
counters_report=self.counters_path)
runner.run_validations()

# 4. Assert that the correct method was called on the mock with loaded counters
mock_validator_instance.validate_counter_zero.assert_called_once()
call_args, _ = mock_validator_instance.validate_counter_zero.call_args
self.assertEqual(call_args[0]['invalid-lat-lng'], 0)

240 changes: 240 additions & 0 deletions tools/import_validation/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -1031,3 +1031,243 @@ def validate_goldens(self, df: pd.DataFrame,
ValidationStatus.DATA_ERROR,
'GOLDENS_CHECK',
message=f"Error during golden validation: {e}")

def validate_counter_zero(self, counters: dict,
params: dict) -> ValidationResult:
"""Checks that a specific counter is strictly zero.

Args:
counters: A dictionary containing the counter values.
params: A dictionary containing the validation parameters, which must
have a 'counter_name' key.

Returns:
A ValidationResult object.
"""
if 'counter_name' not in params:
return ValidationResult(
ValidationStatus.CONFIG_ERROR,
'COUNTER_ZERO_CHECK',
message="Configuration error: 'counter_name' key not specified."
)

counter_name = params['counter_name']
value = counters.get(counter_name, 0)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Consider casting the counter value to a numeric type (e.g., float) to ensure robustness against JSON files where counts might be represented as strings. In alignment with repository practices, we avoid using defensive default values (like 0) for missing keys to ensure that missing required data is caught as an error. This is consistent with how other validators in this class handle report data (e.g., validate_lint_errors_count).

Suggested change
value = counters.get(counter_name, 0)
value = float(counters[counter_name])
References
  1. When a configuration dictionary like resource_limits is provided, it is acceptable to assume it contains all required keys and allow it to fail on a KeyError if a key is missing, rather than defensively using default values. This enforces configuration correctness.
  2. When naming a variable for a count of items, prefer the pattern plural_noun_count (e.g., records_count) over singular_noun_counts (e.g., record_counts).


if value > 0:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The check value > 0 might pass if a counter somehow becomes negative, which would still violate the 'strictly zero' requirement. Using value != 0 is safer and more accurate for a zero check.

Suggested change
if value > 0:
if value != 0:

return ValidationResult(
ValidationStatus.FAILED,
'COUNTER_ZERO_CHECK',
message=f"Counter '{counter_name}' is {value}, expected 0.",
details={
'counter_name': counter_name,
'actual_value': value,
'expected_value': 0
})

return ValidationResult(ValidationStatus.PASSED,
'COUNTER_ZERO_CHECK',
details={
'counter_name': counter_name,
'actual_value': value,
'expected_value': 0
})

def validate_counter_max_threshold(self, counters: dict,
params: dict) -> ValidationResult:
"""Checks that a specific counter value does not exceed a maximum threshold.

Args:
counters: A dictionary containing the counter values.
params: A dictionary containing the validation parameters, which must
have 'counter_name' and 'threshold' keys.

Returns:
A ValidationResult object.
"""
if 'counter_name' not in params or 'threshold' not in params:
return ValidationResult(
ValidationStatus.CONFIG_ERROR,
'COUNTER_MAX_THRESHOLD',
message=
"Configuration error: 'counter_name' and 'threshold' must be specified."
)

counter_name = params['counter_name']
threshold = params['threshold']
value = counters.get(counter_name, 0)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Consider casting the counter value to a numeric type for robustness against string values in the JSON report. Avoid using defensive default values for missing keys to ensure input correctness.

Suggested change
value = counters.get(counter_name, 0)
value = float(counters[counter_name])
References
  1. When a configuration dictionary like resource_limits is provided, it is acceptable to assume it contains all required keys and allow it to fail on a KeyError if a key is missing, rather than defensively using default values. This enforces configuration correctness.


if value > threshold:
return ValidationResult(
ValidationStatus.FAILED,
'COUNTER_MAX_THRESHOLD',
message=
f"Counter '{counter_name}' is {value}, which exceeds the threshold of {threshold}.",
details={
'counter_name': counter_name,
'actual_value': value,
'threshold': threshold
})

return ValidationResult(ValidationStatus.PASSED,
'COUNTER_MAX_THRESHOLD',
details={
'counter_name': counter_name,
'actual_value': value,
'threshold': threshold
})

def validate_counter_ratio_threshold(self, counters: dict,
params: dict) -> ValidationResult:
"""Checks that the ratio of two counters (percentage) does not exceed a threshold.

Args:
counters: A dictionary containing the counter values.
params: A dictionary containing the validation parameters, which must
have 'subset_counter', 'total_counter', and 'threshold_percent' keys.

Returns:
A ValidationResult object.
"""
if 'subset_counter' not in params or 'total_counter' not in params or 'threshold_percent' not in params:
return ValidationResult(
ValidationStatus.CONFIG_ERROR,
'COUNTER_RATIO_THRESHOLD',
message=
"Configuration error: 'subset_counter', 'total_counter', and 'threshold_percent' must be specified."
)

subset_counter = params['subset_counter']
total_counter = params['total_counter']
threshold_percent = params['threshold_percent']

subset_value = counters.get(subset_counter, 0)
total_value = counters.get(total_counter, 0)
Comment on lines +1144 to +1145

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Consider casting the counter values to numeric types for robustness and to ensure correct division and comparison. Avoid using defensive default values for missing keys to ensure the validation fails if expected data is absent.

Suggested change
subset_value = counters.get(subset_counter, 0)
total_value = counters.get(total_counter, 0)
subset_value = float(counters[subset_counter])
total_value = float(counters[total_counter])
References
  1. When a configuration dictionary like resource_limits is provided, it is acceptable to assume it contains all required keys and allow it to fail on a KeyError if a key is missing, rather than defensively using default values. This enforces configuration correctness.


if total_value == 0:
if subset_value > 0:
percent = 100.0
else:
percent = 0.0
else:
percent = (subset_value / total_value) * 100

if percent > threshold_percent:
return ValidationResult(
ValidationStatus.FAILED,
'COUNTER_RATIO_THRESHOLD',
message=
f"Ratio of '{subset_counter}' to '{total_counter}' is {percent:.2f}%, which exceeds the threshold of {threshold_percent}%.",
details={
'subset_counter': subset_counter,
'subset_value': subset_value,
'total_counter': total_counter,
'total_value': total_value,
'percent': percent,
'threshold_percent': threshold_percent
})

return ValidationResult(ValidationStatus.PASSED,
'COUNTER_RATIO_THRESHOLD',
details={
'subset_counter': subset_counter,
'subset_value': subset_value,
'total_counter': total_counter,
'total_value': total_value,
'percent': percent,
'threshold_percent': threshold_percent
})

def validate_counter_sum_integrity(self, counters: dict,
params: dict) -> ValidationResult:
"""Checks that the sum of constituent counters equals a total counter.

Args:
counters: A dictionary containing the counter values.
params: A dictionary containing the validation parameters, which must
have 'total_counter' and 'constituent_counters' keys.

Returns:
A ValidationResult object.
"""
if 'total_counter' not in params or 'constituent_counters' not in params:
return ValidationResult(
ValidationStatus.CONFIG_ERROR,
'COUNTER_SUM_INTEGRITY',
message=
"Configuration error: 'total_counter' and 'constituent_counters' must be specified."
)

total_counter = params['total_counter']
constituent_counters = params['constituent_counters']

total_value = counters.get(total_counter, 0)
constituent_sum = sum(
counters.get(c, 0) for c in constituent_counters)

Comment on lines +1205 to +1207

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Consider casting the counter values to numeric types to ensure the sum and comparison operations are robust against string values in the JSON report. Avoid using defensive default values for missing keys.

Suggested change
constituent_sum = sum(
counters.get(c, 0) for c in constituent_counters)
total_value = float(counters[total_counter])
constituent_sum = sum(
float(counters[c]) for c in constituent_counters)
References
  1. When a configuration dictionary like resource_limits is provided, it is acceptable to assume it contains all required keys and allow it to fail on a KeyError if a key is missing, rather than defensively using default values. This enforces configuration correctness.

if total_value != constituent_sum:
return ValidationResult(
ValidationStatus.FAILED,
'COUNTER_SUM_INTEGRITY',
message=
f"Sum of constituent counters ({constituent_sum}) does not equal total counter '{total_counter}' ({total_value}).",
details={
'total_counter': total_counter,
'total_value': total_value,
'constituent_counters': constituent_counters,
'constituent_sum': constituent_sum
})

return ValidationResult(ValidationStatus.PASSED,
'COUNTER_SUM_INTEGRITY',
details={
'total_counter': total_counter,
'total_value': total_value,
'constituent_counters': constituent_counters,
'constituent_sum': constituent_sum
})

def validate_counter_min_yield(self, counters: dict,
params: dict) -> ValidationResult:
"""Checks that a specific counter is above a minimum yield.

Args:
counters: A dictionary containing the counter values.
params: A dictionary containing the validation parameters, which must
have 'counter_name' and 'min_yield' keys.

Returns:
A ValidationResult object.
"""
if 'counter_name' not in params or 'min_yield' not in params:
return ValidationResult(
ValidationStatus.CONFIG_ERROR,
'COUNTER_MIN_YIELD',
message=
"Configuration error: 'counter_name' and 'min_yield' must be specified."
)

counter_name = params['counter_name']
min_yield = params['min_yield']
value = counters.get(counter_name, 0)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Consider casting the counter value to a numeric type for robustness. Avoid using defensive default values for missing keys.

Suggested change
value = counters.get(counter_name, 0)
value = float(counters[counter_name])
References
  1. When a configuration dictionary like resource_limits is provided, it is acceptable to assume it contains all required keys and allow it to fail on a KeyError if a key is missing, rather than defensively using default values. This enforces configuration correctness.


if value < min_yield:
return ValidationResult(
ValidationStatus.FAILED,
'COUNTER_MIN_YIELD',
message=
f"Counter '{counter_name}' is {value}, which is below the minimum yield of {min_yield}.",
details={
'counter_name': counter_name,
'actual_value': value,
'min_yield': min_yield
})

return ValidationResult(ValidationStatus.PASSED,
'COUNTER_MIN_YIELD',
details={
'counter_name': counter_name,
'actual_value': value,
'min_yield': min_yield
})

Loading
Loading