def main():
    """
    Example script: match the two sample author tables with SimilarityFlooding
    and print the top-2 column rankings derived from the resulting matches.
    """
    # Read both sample CSV files from the local 'data' directory
    first_table = pd.read_csv(os.path.join('data', 'authors1.csv'))
    second_table = pd.read_csv(os.path.join('data', 'authors2.csv'))

    # Run the matcher over the two dataframes
    matches = valentine_match(first_table, second_table, SimilarityFlooding())

    # Compute both rankings up front, before anything is printed
    ranking_all = get_top_n_columns(matches, 2)
    ranking_authors = get_top_n_columns_for_column(matches, 2, 'Authors')

    # Print every section as a heading followed by the pretty-printed payload
    show = pprint.PrettyPrinter(indent=4).pprint
    sections = (
        ("Found the following matches:", matches),
        ("Top 2 columns for each column:", ranking_all),
        ("Top 2 columns for 'Authors' column in table 1:", ranking_authors),
    )
    for heading, payload in sections:
        print(heading)
        show(payload)


if __name__ == '__main__':
    main()
one_to_one_matches(matches: dict): matched[key[0]] = False matched[key[1]] = False - median = list(set_match_values)[math.ceil(len(set_match_values)/2)] + median = list(set_match_values)[math.ceil(len(set_match_values) / 2)] matches1to1 = dict() @@ -232,7 +233,7 @@ def precision_at_n_percent(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]] def recall_at_sizeof_ground_truth(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], - golden_standard: List[Tuple[str, str]],): + golden_standard: List[Tuple[str, str]], ): """ Function that calculates the recall at the size of the ground truth. e.g. if the size of ground truth size is 10 then only the first 10 matches will be considered for @@ -254,3 +255,78 @@ def recall_at_sizeof_ground_truth(matches: Dict[Tuple[Tuple[str, str], Tuple[str if tp + fn == 0: return 0 return tp / (tp + fn) + + +def get_top_n_columns(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float], n: int): + """ + Returns the top n columns (regarding similarity) for each column (applies to both tables) + Example output (n=2): { + ('table_1', 'Authors'): ['Access Type', 'Authors'], + ('table_2', 'Authors'): ['Authors', 'Cited by'] + ... 
def get_top_n_columns_for_column(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float],
                                 n: int,
                                 column_name: str):
    """
    Given a column name of dataframe1, returns the names of the (at most) n most similar
    columns in dataframe2, ordered from the highest similarity score to the lowest.

    Parameters
    ----------
    matches : dict
        Mapping of ((table, column), (table, column)) pairs to their similarity score.
        It does not have to be pre-sorted: the ranking is derived from the scores.
    n : int
        The maximum number of columns to return; values <= 0 yield an empty list
    column_name : str
        The column name in the first dataframe (first element of each match pair)

    Returns
    -------
    list
        A list with up to n column names of dataframe2, best match first
    """

    top_columns = list()

    # Nothing can be selected for a non-positive n
    if n <= 0:
        return top_columns

    # Rank on the score itself. Sorting the dictionary keys would order the pairs alphabetically
    # by column name and ignore the similarity. The sort is stable, so equally scored matches
    # keep the order they have in `matches`.
    ranked_matches = sorted(matches.items(), key=lambda item: item[1], reverse=True)

    for (column_a, column_b), _score in ranked_matches:
        if column_a[1] != column_name:
            continue

        top_columns.append(column_b[1])

        # End condition: The n columns names have been found
        if len(top_columns) >= n:
            break

    return top_columns
diff --git a/examples/valentine_top_columns_example.py b/examples/valentine_top_columns_example.py index 6c1eb06..a9333f1 100644 --- a/examples/valentine_top_columns_example.py +++ b/examples/valentine_top_columns_example.py @@ -20,6 +20,8 @@ def main(): # Find the top-n columns for all columns in dataframe1 (authors1.csv) all_top_2_columns = get_top_n_columns(matches, 2) + + # Find the top-n columns for the column 'Authors' in dataframe1 authors_top_2_columns = get_top_n_columns_for_column(matches, 2, 'Authors') pp = pprint.PrettyPrinter(indent=4) From ae8442d295a2fa1ab8c31c785b791e86eb6f9279 Mon Sep 17 00:00:00 2001 From: mikhailkonstan Date: Sun, 22 Oct 2023 19:19:59 +0200 Subject: [PATCH 3/4] Added: Score value in get_top_n_columns result --- examples/valentine_top_columns_example.py | 2 +- valentine/metrics/metrics.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/valentine_top_columns_example.py b/examples/valentine_top_columns_example.py index a9333f1..ec16a6c 100644 --- a/examples/valentine_top_columns_example.py +++ b/examples/valentine_top_columns_example.py @@ -28,7 +28,7 @@ def main(): print("Found the following matches:") pp.pprint(matches) - print("Top 2 columns for each column:") + print("Top 2 columns for each column (with their corresponding score):") pp.pprint(all_top_2_columns) print("Top 2 columns for 'Authors' column in table 1:") diff --git a/valentine/metrics/metrics.py b/valentine/metrics/metrics.py index 9d25a70..aec73c6 100644 --- a/valentine/metrics/metrics.py +++ b/valentine/metrics/metrics.py @@ -277,7 +277,11 @@ def get_top_n_columns(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], flo ------- dict A dictionary with its keys to be equal to the column names of the first dataframe and its values to be - a list of the top n columns + a list of dictionaries with the top n columns + + output example: + key: ('table_1', 'Authors') + value: [{'Access Type': 0.1515703989838858}, {'Authors': 
0.2816471572126128}] """ # Create an empty dictionary where each column holds a list of the top similar @@ -288,11 +292,13 @@ def get_top_n_columns(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], flo # Iterate sort matches and add the columns to the dictionary for column_a, column_b in sorted(matches): + score = matches[(column_a, column_b)] + if len(top_columns[column_a]) < n: - top_columns[column_a].append(column_b[1]) + top_columns[column_a].append({column_b[1]: score}) if len(top_columns[column_b]) < n: - top_columns[column_b].append(column_a[1]) + top_columns[column_b].append({column_a[1]: score}) return top_columns From 49768914797f14ed7a015ba0f7dfe3519ee1dafb Mon Sep 17 00:00:00 2001 From: mikhailkonstan Date: Sun, 22 Oct 2023 21:02:36 +0200 Subject: [PATCH 4/4] Refactored: get_top_n_columns accepts keys and get_top_n_columns_for_column has been deleted --- examples/valentine_top_columns_example.py | 9 +++- valentine/metrics/metrics.py | 62 +++++++---------------- 2 files changed, 26 insertions(+), 45 deletions(-) diff --git a/examples/valentine_top_columns_example.py b/examples/valentine_top_columns_example.py index ec16a6c..e7f608f 100644 --- a/examples/valentine_top_columns_example.py +++ b/examples/valentine_top_columns_example.py @@ -4,7 +4,7 @@ from valentine.algorithms import SimilarityFlooding import pprint -from valentine.metrics.metrics import get_top_n_columns, get_top_n_columns_for_column +from valentine.metrics.metrics import get_top_n_columns def main(): @@ -22,7 +22,9 @@ def main(): all_top_2_columns = get_top_n_columns(matches, 2) # Find the top-n columns for the column 'Authors' in dataframe1 - authors_top_2_columns = get_top_n_columns_for_column(matches, 2, 'Authors') + authors_top_2_columns = get_top_n_columns(matches, 2, [('table_1', 'Authors')]) + + authors_year_top_2_columns = get_top_n_columns(matches, 2, [('table_1', 'Authors'), ('table_1', 'Year')]) pp = pprint.PrettyPrinter(indent=4) print("Found the following matches:") @@ 
def get_top_n_columns(matches: Dict[Tuple[Tuple[str, str], Tuple[str, str]], float],
                      n: int,
                      keys: List[Tuple[str, str]] = None):
    """
    Returns the top n columns (regarding similarity) for each column (applies to both tables) with their
    corresponding score, ordered from the highest score to the lowest
    Example output (n=2): {
        ('table_1', 'Authors'): [{'Authors': 0.2816471572126128}, {'Access Type': 0.1515703989838858}],
        ('table_2', 'Authors'): [{'Cited by': 0.2816471572126128}, {'Authors': 0.1515703989838858}]
        ...
    }

    Parameters
    ----------
    matches : dict
        Mapping of ((table, column), (table, column)) pairs to their similarity score.
        It does not have to be pre-sorted: the ranking is derived from the scores.
    n : int
        The maximum number of columns to return for each key
    keys : List[Tuple[str, str]], optional
        If specified, it will only return the top n columns for the given keys; keys that never
        occur in `matches` map to an empty list. If omitted (None), every column that occurs in
        `matches` is used, in order of first appearance.
        Example : [('table_1', 'Authors'), ('table_1', 'Access Type')]

    Returns
    -------
    dict
        A dictionary with its keys to be equal to the requested (table, column) tuples and its values to be
        a list of single-entry dictionaries {column name: score} with the top n columns

        output example:
            key: ('table_1', 'Authors')
            value: [{'Authors': 0.2816471572126128}, {'Access Type': 0.1515703989838858}]
    """

    # Identify the keys of the top columns that are going to be returned. Use from parameters or get all unique
    # keys; dict.fromkeys removes duplicates while keeping first-seen order, so the result is deterministic
    if keys is None:
        keys = dict.fromkeys(column for pair in matches for column in pair)

    # Create a dictionary where each requested column holds a (still empty) list of its top similar columns
    top_columns = {key: list() for key in keys}

    # Rank on the score itself. Sorting the dictionary keys would order the pairs alphabetically by column
    # name and ignore the similarity. The sort is stable, so equally scored matches keep the order they
    # have in `matches`.
    ranked_matches = sorted(matches.items(), key=lambda item: item[1], reverse=True)

    for (column_a, column_b), score in ranked_matches:
        # Every match is a candidate for both of its columns, each one seeing the other as its partner
        for column, partner in ((column_a, column_b), (column_b, column_a)):
            # Dictionary lookup doubles as the "is this column of any interest" check (None -> not requested)
            best_so_far = top_columns.get(column)
            if best_so_far is not None and len(best_so_far) < n:
                best_so_far.append({partner[1]: score})

    return top_columns
Ranked list of matches from the match with higher similarity to lower - n : int - The maximum number of columns to return - column_name : str - The column name in the first dataframe - - Returns - ------- - list - A list of the top n column names in dataframe2 - """ - - top_columns = list() - - # Iterate sort matches and add the columns to the dictionary - for column_a, column_b in sorted(matches): - if column_a[1] == column_name: - top_columns.append(column_b[1]) - - # End condition: The n columns names have been found - if len(top_columns) >= n: - break - - return top_columns