diff --git a/scripts/world_bank/wdi/README.md b/scripts/world_bank/wdi/README.md index ef1f0f5dc6..239d0f7277 100644 --- a/scripts/world_bank/wdi/README.md +++ b/scripts/world_bank/wdi/README.md @@ -146,5 +146,24 @@ If you want to perform "only download", run the below command: python3 worldbank.py --mode=download ``` +### Golden checks and the increased threshold in validation_config.json + +The `GOLDENS_CHECK` validator confirms that the import includes a specific set of expected records. This is useful for verifying that critical StatVars, Places, or specific metadata combinations are consistently present in the output. + +The validator compares the input data (usually from the stats data source) against one or more "golden" files (MCF or CSV). + +If any combination of values in a golden file row is missing from the input, the validation fails. The missing golden rows are then listed in the validation report JSON. + +To generate golden files, run the commands below (paths are relative to `tools/import_validation`): +```bash +# Goldens from the output CSV +python3 validator_goldens.py --validate_goldens_input=../../scripts/world_bank/wdi/output/WorldBank.csv --generate_goldens=golden_data/golden_observations.csv --goldens_must_include="ISO3166Alpha3:gs://unresolved_mcf/import_validation/top_100k_places.csv" --generate_goldens_property_sets="ISO3166Alpha3" +``` + +```bash +# Goldens from the summary report +python3 validator_goldens.py --validate_goldens_input="summary_report.csv" --generate_goldens=golden_data/golden_summary_report.csv --generate_goldens_property_sets="StatVar|Units|MinDate|MeasurementMethods|observationPeriod" +``` + We highly recommend the use of the import validation tool for this import which you can find in https://github.com/datacommonsorg/tools/tree/master/import-validation-helper. 
diff --git a/scripts/world_bank/wdi/golden_data/golden_summary_report.csv b/scripts/world_bank/wdi/golden_data/golden_summary_report.csv new file mode 100644 index 0000000000..bf2d3335a1 --- /dev/null +++ b/scripts/world_bank/wdi/golden_data/golden_summary_report.csv @@ -0,0 +1,71 @@ +"NumPlaces","StatVar","ScalingFactors","MeasurementMethods","Units","observationPeriods","MinDate" +"186","Count_Death_IntentionalSelfHarm_Male_AsFractionOf_Count_Person_Male","[]","[]","[Per100000Males]","[P1Y]","2000" +"203","Amount_EconomicActivity_GrossNationalIncome_PurchasingPowerParity","[]","[]","[InternationalDollar]","[P1Y]","1990" +"165","Count_Person_Upto4Years_Wasting_AsFractionOf_Count_Person_Upto4Years","[100]","[JointChildMalnutritionEstimate]","[Percent]","[P1Y]","1983" +"144","Count_Person_25OrMoreYears_DoctorateDegree_AsFractionOf_Count_Person_25OrMoreYears","[]","[]","[]","[P1Y]","1994" +"204","Amount_Emissions_CarbonDioxide_PerCapita","[]","[]","[MetricTon]","[P1Y]","1970" +"184","Count_Person_25OrMoreYears_Male_TertiaryEducation_AsFractionOf_Count_Person_25OrMoreYears_Male","[]","[]","[]","[P1Y]","1970" +"218","LifeExpectancy_Person_Female","[]","[]","[Year]","[P1Y]","1960" +"139","Count_Person_25OrMoreYears_Male_DoctorateDegree_AsFractionOf_Count_Person_25OrMoreYears_Male","[]","[]","[]","[P1Y]","1994" +"197","Count_CriminalActivities_MurderAndNonNegligentManslaughter_AsFractionOf_Count_Person","[]","[]","[Per100000Persons]","[P1Y]","1990" +"194","Amount_EconomicActivity_ExpenditureActivity_HealthcareExpenditure_AsFractionOf_Count_Person","[]","[]","[InternationalDollar, USDollar]","[P1Y]","2000" +"202","Amount_EconomicActivity_ExpenditureActivity_EducationExpenditure_Government_AsFractionOf_Amount_EconomicActivity_ExpenditureActivity_Government","[100]","[]","[Percent]","[P1Y]","1980" +"188","Count_Person_25OrMoreYears_Male_BachelorsDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Male","[]","[]","[]","[P1Y]","1970" 
+"218","FertilityRate_Person_Female","[]","[]","[]","[]","1960" +"218","Count_Person_Rural","[]","[WorldBankEstimate]","[]","[P1Y]","1960" +"183","Count_Person_25OrMoreYears_Female_TertiaryEducation_AsFractionOf_Count_Person_25OrMoreYears_Female","[]","[]","[]","[P1Y]","1970" +"218","Count_Person_Urban","[]","[WorldBankEstimate]","[]","[P1Y]","1960" +"165","Count_Person_Upto4Years_Overweight_AsFractionOf_Count_Person_Upto4Years","[]","[]","[]","[P1Y]","1983" +"218","LifeExpectancy_Person_Male","[]","[]","[Year]","[P1Y]","1960" +"218","Count_BirthEvent_LiveBirth_AsFractionOf_Count_Person","[]","[]","[Per1000Persons]","[P1Y]","1960" +"197","MortalityRate_Person_Upto4Years_AsFractionOf_Count_BirthEvent_LiveBirth","[]","[]","[Per1000LiveBirths]","[P1Y]","1960" +"218","Count_Person","[]","[]","[]","[P1Y]","1960" +"160","Count_Person_Upto4Years_Male_Wasting_AsFractionOf_Count_Person_Upto4Years_Male","[100]","[JointChildMalnutritionEstimate]","[Percent]","[P1Y]","1986" +"204","Amount_EconomicActivity_ExpenditureActivity_EducationExpenditure_Government_AsFractionOf_Amount_EconomicActivity_GrossDomesticProduction_Nominal","[100]","[]","[Percent]","[P1Y]","1970" +"188","Count_Person_25OrMoreYears_BachelorsDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears","[]","[]","[]","[P1Y]","1970" +"165","Count_Person_15OrMoreYears_Female_Smoking_AsFractionOf_Count_Person_15OrMoreYears_Female","[]","[AgeAdjustedPrevalence]","[]","[P1Y]","2000" +"165","Count_Person_15OrMoreYears_Smoking_AsFractionOf_Count_Person_15OrMoreYears","[]","[AgeAdjustedPrevalence]","[]","[P1Y]","2000" +"203","Amount_EconomicActivity_GrossNationalIncome_PurchasingPowerParity_PerCapita","[]","[]","[InternationalDollar]","[P1Y]","1990" +"160","Count_Person_Upto4Years_Male_Overweight_AsFractionOf_Count_Person_Upto4Years_Male","[]","[]","[]","[P1Y]","1986" 
+"195","Amount_EconomicActivity_ExpenditureActivity_TertiaryEducationExpenditure_Government_AsFractionOf_Amount_EconomicActivity_ExpenditureActivity_EducationExpenditure_Government","[]","[]","[]","[P1Y]","1970" +"159","Count_Person_Upto4Years_Male_SevereWasting_AsFractionOf_Count_Person_Upto4Years_Male","[100]","[JointChildMalnutritionEstimate]","[Percent]","[P1Y]","1986" +"151","Amount_Consumption_Electricity_PerCapita","[]","[]","[KilowattHour]","[P1Y]","1990" +"180","Amount_Consumption_Energy_PerCapita","[]","[]","[KilogramOfOilEquivalent]","[P1Y]","1990" +"186","Count_Death_IntentionalSelfHarm_Female_AsFractionOf_Count_Person_Female","[]","[]","[Per100000Females]","[P1Y]","2000" +"165","Count_Person_15OrMoreYears_Male_Smoking_AsFractionOf_Count_Person_15OrMoreYears_Male","[]","[AgeAdjustedPrevalence]","[]","[P1Y]","2000" +"149","Count_CriminalActivities_MurderAndNonNegligentManslaughter_Male_AsFractionOf_Count_Person_Male","[]","[]","[Per100000Males]","[P1Y]","1990" +"200","Amount_Remittance_InwardRemittance_AsFractionOf_Amount_EconomicActivity_GrossDomesticProduction_Nominal","[100]","[WorldBankEstimate]","[Percent]","[P1Y]","1970" +"188","Count_Person_15To64Years_InLaborForce_AsFractionOf_Count_Person_15To64Years","[]","[]","[]","[P1Y]","1990" +"171","GiniIndex_EconomicActivity","[]","[WorldBankEstimate]","[]","[P1Y]","1963" +"162","Count_Person_25OrMoreYears_Female_MastersDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Female","[]","[]","[]","[P1Y]","1990" +"170","Count_Person_25OrMoreYears_MastersDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears","[]","[]","[]","[P1Y]","1990" +"152","Count_CriminalActivities_MurderAndNonNegligentManslaughter_Female_AsFractionOf_Count_Person_Female","[]","[]","[Per100000Females]","[P1Y]","1990" +"188","Count_Person_15To64Years_Female_InLaborForce_AsFractionOf_Count_Person_15To64Years_Female","[]","[]","[]","[P1Y]","1990" 
+"104","Amount_Stock_AsFractionOf_Amount_EconomicActivity_GrossDomesticProduction_Nominal","[100]","[]","[Percent]","[P1Y]","1975" +"131","Count_Person_25OrMoreYears_Female_DoctorateDegree_AsFractionOf_Count_Person_25OrMoreYears_Female","[]","[]","[]","[P1Y]","1994" +"215","GrowthRate_Amount_EconomicActivity_GrossDomesticProduction","[]","[]","[]","[P1Y]","1961" +"218","Count_Death_AsAFractionOfCount_Person","[]","[WorldBankWeightedAverage]","[Per1000Persons]","[P1Y]","1960" +"215","Amount_EconomicActivity_GrossDomesticProduction_Nominal","[]","[]","[USDollar]","[P1Y]","1960" +"188","Count_Person_15To64Years_Male_InLaborForce_AsFractionOf_Count_Person_15To64Years_Male","[]","[]","[]","[P1Y]","1990" +"200","Amount_Remittance_InwardRemittance","[]","[WorldBankEstimate]","[USDollar]","[P1Y]","1970" +"161","Count_Person_Upto4Years_SevereWasting_AsFractionOf_Count_Person_Upto4Years","[100]","[JointChildMalnutritionEstimate]","[Percent]","[P1Y]","1983" +"188","Count_Person_25OrMoreYears_Female_BachelorsDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Female","[]","[]","[]","[P1Y]","1970" +"167","Count_Person_25OrMoreYears_Male_MastersDegreeOrHigher_AsFractionOf_Count_Person_25OrMoreYears_Male","[]","[]","[]","[P1Y]","1990" +"188","Amount_Consumption_Alcohol_15OrMoreYears_AsFractionOf_Count_Person_15OrMoreYears","[]","[WorldHealthOrganizationEstimates]","[Liter]","[P1Y]","2000" +"188","Count_Person_15OrMoreYears_InLaborForce_Female_AsFractionOf_Count_Person_InLaborForce","[]","[]","[]","[P1Y]","1990" +"215","Count_Product_MobileCellularSubscription_AsFractionOf_Count_Person","[]","[]","[]","[P1Y]","1960" +"188","Count_Person_InLaborForce","[]","[InternationalLaborOrganization]","[]","[P1Y]","1990" +"186","Count_Death_IntentionalSelfHarm_AsFractionOf_Count_Person","[]","[]","[Per100000Persons]","[P1Y]","2000" +"197","Count_Death_0Years_AsFractionOf_Count_BirthEvent_LiveBirth","[]","[UnitedNationsIGMEEstimate]","[Per1000LiveBirths]","[P1Y]","1960" 
+"160","Count_Person_Upto4Years_Female_Wasting_AsFractionOf_Count_Person_Upto4Years_Female","[100]","[JointChildMalnutritionEstimate]","[Percent]","[P1Y]","1986" +"203","Amount_Remittance_OutwardRemittance","[]","[WorldBankEstimate]","[USDollar]","[P1Y]","1970" +"160","Count_Person_Upto4Years_Female_Overweight_AsFractionOf_Count_Person_Upto4Years_Female","[]","[]","[]","[P1Y]","1986" +"214","Count_Person_IsInternetUser_PerCapita","[100]","[]","[]","[P1Y]","1990" +"210","Amount_Production_ElectricityFromNuclearSources_AsFractionOf_Amount_Production_Energy","[]","[]","[]","[P1Y]","1990" +"159","Count_Person_Upto4Years_Female_SevereWasting_AsFractionOf_Count_Person_Upto4Years_Female","[100]","[JointChildMalnutritionEstimate]","[Percent]","[P1Y]","1986" +"184","Count_Person_25OrMoreYears_TertiaryEducation_AsFractionOf_Count_Person_25OrMoreYears","[]","[]","[]","[P1Y]","1970" +"210","Amount_Production_ElectricityFromOilGasOrCoalSources_AsFractionOf_Amount_Production_Energy","[]","[]","[]","[P1Y]","1990" +"218","GrowthRate_Count_Person","[]","[]","[]","[P1Y]","1961" +"213","Amount_Consumption_RenewableEnergy_AsFractionOf_Amount_Consumption_Energy","[]","[]","[]","[P1Y]","1990" +"104","Amount_Stock","[]","[]","[USDollar]","[P1Y]","1975" +"218","LifeExpectancy_Person","[]","[]","[Year]","[]","1960" diff --git a/scripts/world_bank/wdi/manifest.json b/scripts/world_bank/wdi/manifest.json index bc3927141e..eb427c0472 100644 --- a/scripts/world_bank/wdi/manifest.json +++ b/scripts/world_bank/wdi/manifest.json @@ -20,7 +20,8 @@ "WorldBankCountries.csv", "schema_csvs/WorldBankIndicators_prod.csv" ], - "cron_schedule": "0 11 * * 2" + "cron_schedule": "0 11 * * 2", + "validation_config_file": "validation_config.json" } ] } \ No newline at end of file diff --git a/scripts/world_bank/wdi/validation_config.json b/scripts/world_bank/wdi/validation_config.json new file mode 100644 index 0000000000..35b5b0baa0 --- /dev/null +++ b/scripts/world_bank/wdi/validation_config.json @@ -0,0 
+1,20 @@ +{ + "schema_version": "1.0", + "rules": [ + { + "rule_id": "check_deleted_records_percent", + "description": "Checks that the percentage of deleted points is within the threshold.", + "validator": "DELETED_RECORDS_PERCENT", + "params": { + "threshold": 0.61 + } + }, + { + "rule_id": "check_goldens_summary_report", + "validator": "GOLDENS_CHECK", + "params": { + "golden_files": "golden_data/golden_summary_report.csv" + } + } + ] +} \ No newline at end of file diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index f1364518e6..777336a165 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -14,6 +14,7 @@ """Module for the ValidationRunner class.""" import os +import logging from absl import app from absl import flags from absl import logging @@ -34,6 +35,55 @@ _FLAGS = flags.FLAGS +def _is_relative_local(path_val: str) -> bool: + """Checks if a path is a relative, local file path. + + This function identifies path strings that represent local relative files + (e.g., 'golden_data/un_wpp.csv') as opposed to absolute paths. It filters + out non-strings, empty strings, and absolute local paths. + + Args: + path_val: The file path string to evaluate. + + Returns: + True if the path represents a relative, local file path; False otherwise. + """ + if not isinstance(path_val, str) or not path_val: + return False + return not os.path.isabs(path_val) + + +def _find_base_dir(start_path: str, target_sub_path: str) -> str | None: + """Helper to find a base directory containing a target sub-path by walking up. + + Starting from the absolute directory of `start_path`, this function recursively + checks if `target_sub_path` exists in the current folder. If not, it walks up the + parent directory tree up to 10 levels. 
This is crucial for resolving paths relative + to import-specific golden directories when tests/validation are run from + different working directories (such as the repository root in CI/CD). + + Args: + start_path: The file or directory path to start the upward search from. + target_sub_path: The name of the subdirectory or file (e.g., 'golden_data') + to search for within the parent tree. + + Returns: + The absolute path of the directory containing `target_sub_path` if found, + or None if the root was reached or the 10-level limit was exceeded. + """ + if not start_path: + return None + curr = os.path.abspath(start_path) + for _ in range(8): # limit to 10 levels up + if os.path.exists(os.path.join(curr, target_sub_path)): + return curr + parent = os.path.dirname(curr) + if parent == curr: + break + curr = parent + return None + + class ValidationRunner: """ Orchestrates the validation process based on the new schema. @@ -41,6 +91,8 @@ class ValidationRunner: def __init__(self, validation_config_path: str, differ_output: str, stats_summary: str, lint_report: str, validation_output: str): + self.validation_config_path = validation_config_path + self.stats_summary = stats_summary self.config = ValidationConfig(validation_config_path) self.validation_output = validation_output self.validator = Validator() @@ -212,6 +264,50 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]: if output_dir: rule_params.setdefault('output_path', output_dir) + # Resolve paths relative to the directory of the validation config. 
+ if validator_name == 'GOLDENS_CHECK': + config_dir = None + # Walk up from validation_config_path, self.stats_summary, or CWD to find where 'golden_data' lives + for start in [ + self.validation_config_path, self.stats_summary, + os.getcwd() + ]: + config_dir = _find_base_dir(start, 'golden_data') + if config_dir: + break + + if not config_dir: + config_dir = os.path.dirname( + os.path.abspath(self.validation_config_path)) + + print( + f"DEBUG: Found GOLDENS_CHECK rule: '{rule.get('rule_id')}'" + ) + print( + f"DEBUG: Config directory resolved to: '{config_dir}'") + for path_key in list(rule_params.keys()): + # Check any key in rule_params that equals 'golden_files' or 'input_files' or ends with '_file' or '_files' + if path_key in ( + 'golden_files', + 'input_files') or path_key.endswith( + '_file') or path_key.endswith('_files'): + val = rule_params[path_key] + print( + f"DEBUG: Before resolve '{path_key}': '{val}'") + if isinstance(val, str): + if _is_relative_local(val): + rule_params[path_key] = os.path.join( + config_dir, val) + elif isinstance(val, list): + rule_params[path_key] = [ + os.path.join(config_dir, item) + if _is_relative_local(item) else item + for item in val + ] + print( + f"DEBUG: After resolve '{path_key}': '{rule_params[path_key]}'" + ) + if validator_name == 'SQL_VALIDATOR': result = validation_func(self.data_sources['stats'], self.data_sources['differ'], diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py index 7b19b783fe..54653367a8 100644 --- a/tools/import_validation/validator_goldens.py +++ b/tools/import_validation/validator_goldens.py @@ -68,6 +68,7 @@ --generate_goldens=goldens_data/generated_goldens.csv """ +import csv import os import sys import tempfile @@ -298,7 +299,12 @@ def load_nodes_from_file(files: str) -> dict: file_nodes = file_util.file_load_csv_dict(input_file, key_index=True) for node in file_nodes.values(): - nodes[len(nodes)] = node + # Clean up "dcid:" prefixes 
from values (column headers are kept as is) + clean_node = {} + for k, v in node.items(): + clean_val = v[5:] if (isinstance(v, str) and v.startswith("dcid:")) else v + clean_node[k] = clean_val + nodes[len(nodes)] = clean_node else: # For MCF or JSON, we assume nodes are already keyed by DCID. file_nodes = mcf_file_util.load_mcf_nodes(input_file) @@ -311,7 +317,6 @@ def load_nodes_from_file(files: str) -> dict: logging.info(f'Loaded {len(nodes)} nodes from {input_files}') return nodes - def generate_goldens(input_files: str, property_sets: list, output_file: str = None, @@ -440,9 +445,23 @@ def generate_goldens(input_files: str, if golden_nodes and output_file: logging.info(f'Writing {len(golden_nodes)} goldens to {output_file}') if file_util.file_is_csv(output_file): - file_util.file_write_csv_dict(golden_nodes, - output_file, - key_column_name=None) + headers = [] + for node in golden_nodes.values(): + for prop in node.keys(): + if prop not in headers: + headers.append(prop) + with file_util.FileIO(output_file, mode='w') as csvfile: + writer = csv.DictWriter( + csvfile, + fieldnames=headers, + escapechar='\\', + extrasaction='ignore', + quotechar='"', + quoting=csv.QUOTE_NONNUMERIC, + ) + writer.writeheader() + for node in golden_nodes.values(): + writer.writerow(node) else: mcf_file_util.write_mcf_nodes([golden_nodes], output_file)