From 92ea2fd3745d0c39c87698fa03c6c7d73dd1f3a7 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 18:07:26 -0800 Subject: [PATCH 1/5] ref: don't pass DataModel into BlockLearners It's not needed in there, so we should just pass what is needed. This is prep for further refactoring that removes more uses of DataModel. --- dedupe/labeler.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/dedupe/labeler.py b/dedupe/labeler.py index 5942497a..fcdf4759 100644 --- a/dedupe/labeler.py +++ b/dedupe/labeler.py @@ -216,7 +216,7 @@ def _filter_canopy_predicates( class DedupeBlockLearner(BlockLearner): def __init__( self, - data_model: DataModel, + candidate_predicates: Iterable[Predicate], data: Data, index_include: TrainingExamples, ): @@ -228,7 +228,7 @@ def __init__( index_data = sample_records(data, 50000) sampled_records = sample_records(index_data, N_SAMPLED_RECORDS) - preds = _filter_canopy_predicates(data_model.predicates, canopies=True) + preds = _filter_canopy_predicates(candidate_predicates, canopies=True) self.block_learner = training.DedupeBlockLearner( preds, sampled_records, index_data ) @@ -268,7 +268,7 @@ def _sample(self, data: Data, sample_size: int) -> TrainingExamples: class RecordLinkBlockLearner(BlockLearner): def __init__( self, - data_model: DataModel, + candidate_predicates: Iterable[Predicate], data_1: Data, data_2: Data, index_include: TrainingExamples, @@ -282,7 +282,7 @@ def __init__( index_data = sample_records(data_2, 50000) sampled_records_2 = sample_records(index_data, N_SAMPLED_RECORDS) - preds = _filter_canopy_predicates(data_model.predicates, canopies=False) + preds = _filter_canopy_predicates(candidate_predicates, canopies=False) self.block_learner = training.RecordLinkBlockLearner( preds, sampled_records_1, sampled_records_2, index_data ) @@ -400,7 +400,7 @@ def __init__( index_include = index_include.copy() index_include.append(exact_match) - self.blocker = DedupeBlockLearner(data_model, data, 
index_include) + self.blocker = DedupeBlockLearner(data_model.predicates, data, index_include) self._candidates = self.blocker.candidates.copy() @@ -435,7 +435,9 @@ def __init__( index_include = index_include.copy() index_include.append(exact_match) - self.blocker = RecordLinkBlockLearner(data_model, data_1, data_2, index_include) + self.blocker = RecordLinkBlockLearner( + data_model.predicates, data_1, data_2, index_include + ) self._candidates = self.blocker.candidates.copy() self.matcher = MatchLearner(self.data_model, self.candidates) From cd54a9db727b017d6479fa9d180c49f73fa774c8 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 18:15:23 -0800 Subject: [PATCH 2/5] ref: don't store DataModel in DisagreementLearner We don't use it after the initial construction, so don't store it --- dedupe/labeler.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/dedupe/labeler.py b/dedupe/labeler.py index fcdf4759..5a50e3e8 100644 --- a/dedupe/labeler.py +++ b/dedupe/labeler.py @@ -324,8 +324,7 @@ class DisagreementLearner(HasCandidates): matcher: MatchLearner blocker: BlockLearner - def __init__(self, data_model: DataModel) -> None: - self.data_model = data_model + def __init__(self) -> None: self.y: numpy.typing.NDArray[numpy.int_] = numpy.array([]) self.pairs: TrainingExamples = [] @@ -387,8 +386,7 @@ def __init__( data: Data, index_include: TrainingExamples, ): - super().__init__(data_model) - + super().__init__() data = core.index(data) random_pair = ( @@ -404,7 +402,7 @@ def __init__( self._candidates = self.blocker.candidates.copy() - self.matcher = MatchLearner(self.data_model, self.candidates) + self.matcher = MatchLearner(data_model, self.candidates) examples = [exact_match] * 4 + [random_pair] labels: Labels = [1] * 4 + [0] # type: ignore[assignment] @@ -419,8 +417,7 @@ def __init__( data_2: Data, index_include: TrainingExamples, ): - super().__init__(data_model) - + super().__init__() data_1 = core.index(data_1) offset 
= len(data_1) @@ -440,7 +437,7 @@ def __init__( ) self._candidates = self.blocker.candidates.copy() - self.matcher = MatchLearner(self.data_model, self.candidates) + self.matcher = MatchLearner(data_model, self.candidates) examples = [exact_match] * 4 + [random_pair] labels: Labels = [1] * 4 + [0] # type: ignore[assignment] From c460783ce64fc720167b450fd9c556470e982203 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 18:18:50 -0800 Subject: [PATCH 3/5] ref: Don't require DataModel in MatchLearner We only need the abstract requirement of a Featurizer function. --- dedupe/_typing.py | 6 ++++++ dedupe/labeler.py | 26 +++++++++----------------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/dedupe/_typing.py b/dedupe/_typing.py index 1ab25018..b2e6d7e7 100644 --- a/dedupe/_typing.py +++ b/dedupe/_typing.py @@ -80,6 +80,12 @@ class TrainingData(TypedDict): distinct: List[RecordDictPair] +# Takes pairs of records and generates a (n_samples X n_features) array +FeaturizerFunction = Callable[ + [Sequence[RecordDictPair]], numpy.typing.NDArray[numpy.float_] +] + + class Classifier(Protocol): """Takes an array of pairwise distances and computes the likelihood they are a pair.""" diff --git a/dedupe/labeler.py b/dedupe/labeler.py index 5a50e3e8..96525de8 100644 --- a/dedupe/labeler.py +++ b/dedupe/labeler.py @@ -15,7 +15,7 @@ if TYPE_CHECKING: from typing import Dict, Iterable, Literal, Mapping - from dedupe._typing import Data, Labels, LabelsLike + from dedupe._typing import Data, FeaturizerFunction, Labels, LabelsLike from dedupe._typing import RecordDictPair as TrainingExample from dedupe._typing import RecordDictPairs as TrainingExamples from dedupe._typing import RecordIDPair @@ -70,33 +70,25 @@ def _verify_fit_args(pairs: TrainingExamples, y: LabelsLike) -> list[Literal[0, class MatchLearner(Learner): - def __init__(self, data_model: DataModel, candidates: TrainingExamples): - self.data_model = data_model + def __init__(self, featurizer: 
FeaturizerFunction, candidates: TrainingExamples): + self._featurizer = featurizer self._candidates = candidates.copy() self._classifier = sklearn.linear_model.LogisticRegression() - self._distances = self._calc_distances(self.candidates) + self._features = self._featurizer(self.candidates) def fit(self, pairs: TrainingExamples, y: LabelsLike) -> None: y = self._verify_fit_args(pairs, y) - self._classifier.fit(self._calc_distances(pairs), numpy.array(y)) + self._classifier.fit(self._featurizer(pairs), numpy.array(y)) self._fitted = True def remove(self, index: int) -> None: self._candidates.pop(index) - self._distances = numpy.delete(self._distances, index, axis=0) + self._features = numpy.delete(self._features, index, axis=0) def candidate_scores(self) -> numpy.typing.NDArray[numpy.float_]: if not self._fitted: raise ValueError("Must call fit() before candidate_scores()") - scores: numpy.typing.NDArray[numpy.float_] = self._classifier.predict_proba( - self._distances - )[:, 1].reshape(-1, 1) - return scores - - def _calc_distances( - self, pairs: TrainingExamples - ) -> numpy.typing.NDArray[numpy.float_]: - return self.data_model.distances(pairs) + return self._classifier.predict_proba(self._features)[:, 1].reshape(-1, 1) class BlockLearner(Learner): @@ -402,7 +394,7 @@ def __init__( self._candidates = self.blocker.candidates.copy() - self.matcher = MatchLearner(data_model, self.candidates) + self.matcher = MatchLearner(data_model.distances, self.candidates) examples = [exact_match] * 4 + [random_pair] labels: Labels = [1] * 4 + [0] # type: ignore[assignment] @@ -437,7 +429,7 @@ def __init__( ) self._candidates = self.blocker.candidates.copy() - self.matcher = MatchLearner(data_model, self.candidates) + self.matcher = MatchLearner(data_model.distances, self.candidates) examples = [exact_match] * 4 + [random_pair] labels: Labels = [1] * 4 + [0] # type: ignore[assignment] From 135ca94066306f16091f6abc7440dba2f2cae563 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: 
Thu, 15 Sep 2022 18:38:29 -0800 Subject: [PATCH 4/5] ref: Remove use of DataModel from active learners --- dedupe/api.py | 6 ++++-- dedupe/labeler.py | 15 ++++++++------- tests/test_labeler.py | 4 +++- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/dedupe/api.py b/dedupe/api.py index fe701fff..707be427 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -1325,7 +1325,8 @@ def prepare_training( examples, y = flatten_training(self.training_pairs) self.active_learner = labeler.DedupeDisagreementLearner( - self.data_model, + self.data_model.predicates, + self.data_model.distances, data, index_include=examples, ) @@ -1392,7 +1393,8 @@ def prepare_training( examples, y = flatten_training(self.training_pairs) self.active_learner = labeler.RecordLinkDisagreementLearner( - self.data_model, + self.data_model.predicates, + self.data_model.distances, data_1, data_2, index_include=examples, diff --git a/dedupe/labeler.py b/dedupe/labeler.py index 96525de8..3882acd7 100644 --- a/dedupe/labeler.py +++ b/dedupe/labeler.py @@ -19,7 +19,6 @@ from dedupe._typing import RecordDictPair as TrainingExample from dedupe._typing import RecordDictPairs as TrainingExamples from dedupe._typing import RecordIDPair - from dedupe.datamodel import DataModel from dedupe.predicates import Predicate @@ -374,7 +373,8 @@ def learn_predicates( class DedupeDisagreementLearner(DisagreementLearner): def __init__( self, - data_model: DataModel, + candidate_predicates: Iterable[Predicate], + featurizer: FeaturizerFunction, data: Data, index_include: TrainingExamples, ): @@ -390,11 +390,11 @@ def __init__( index_include = index_include.copy() index_include.append(exact_match) - self.blocker = DedupeBlockLearner(data_model.predicates, data, index_include) + self.blocker = DedupeBlockLearner(candidate_predicates, data, index_include) self._candidates = self.blocker.candidates.copy() - self.matcher = MatchLearner(data_model.distances, self.candidates) + self.matcher = MatchLearner(featurizer, 
self.candidates) examples = [exact_match] * 4 + [random_pair] labels: Labels = [1] * 4 + [0] # type: ignore[assignment] @@ -404,7 +404,8 @@ def __init__( class RecordLinkDisagreementLearner(DisagreementLearner): def __init__( self, - data_model: DataModel, + candidate_predicates: Iterable[Predicate], + featurizer: FeaturizerFunction, data_1: Data, data_2: Data, index_include: TrainingExamples, @@ -425,11 +426,11 @@ def __init__( index_include.append(exact_match) self.blocker = RecordLinkBlockLearner( - data_model.predicates, data_1, data_2, index_include + candidate_predicates, data_1, data_2, index_include ) self._candidates = self.blocker.candidates.copy() - self.matcher = MatchLearner(data_model.distances, self.candidates) + self.matcher = MatchLearner(featurizer, self.candidates) examples = [exact_match] * 4 + [random_pair] labels: Labels = [1] * 4 + [0] # type: ignore[assignment] diff --git a/tests/test_labeler.py b/tests/test_labeler.py index 7f9c8df2..30609ffa 100644 --- a/tests/test_labeler.py +++ b/tests/test_labeler.py @@ -43,7 +43,9 @@ def test_AL(self): ({"name": "William", "age": "35"}, {"name": "Jimbo", "age": "21"}), ] EXPECTED_CANDIDATES = {freeze_record_pair(pair) for pair in EXPECTED_CANDIDATES} - active_learner = labeler.DedupeDisagreementLearner(self.data_model, SAMPLE, []) + active_learner = labeler.DedupeDisagreementLearner( + self.data_model.predicates, self.data_model.distances, SAMPLE, [] + ) actual_candidates = set() for i in range(len(EXPECTED_CANDIDATES), 0, -1): assert len(active_learner) == i From b70e283811285114df749fc3986e4df225d73c86 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 15 Sep 2022 18:48:28 -0800 Subject: [PATCH 5/5] ref: Don't use DataModel in core --- dedupe/api.py | 4 ++-- dedupe/core.py | 26 +++++++++++++------------- tests/test_core.py | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/dedupe/api.py b/dedupe/api.py index 707be427..ed98e520 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ 
-107,7 +107,7 @@ def score(self, pairs: RecordPairs) -> Scores: """ try: matches = core.scoreDuplicates( - pairs, self.data_model, self.classifier, self.num_cores + pairs, self.data_model.distances, self.classifier, self.num_cores ) except RuntimeError: raise RuntimeError( @@ -824,7 +824,7 @@ def score(self, blocks: Blocks) -> Generator[Scores, None, None]: """ matches = core.scoreGazette( - blocks, self.data_model, self.classifier, self.num_cores + blocks, self.data_model.distances, self.classifier, self.num_cores ) return matches diff --git a/dedupe/core.py b/dedupe/core.py index 7d739af5..64a74f66 100644 --- a/dedupe/core.py +++ b/dedupe/core.py @@ -34,6 +34,7 @@ Classifier, ClosableJoinable, Data, + FeaturizerFunction, Literal, MapLike, RecordID, @@ -41,7 +42,6 @@ RecordPairs, Scores, ) - from dedupe.datamodel import DataModel _Queue = Union[multiprocessing.dummy.Queue, multiprocessing.Queue] @@ -53,7 +53,7 @@ class BlockingError(Exception): class ScoreDupes(object): def __init__( self, - data_model: DataModel, + featurizer: FeaturizerFunction, classifier: Classifier, records_queue: _Queue, exception_queue: _Queue, @@ -61,7 +61,7 @@ def __init__( dtype: numpy.dtype, offset, ): - self.data_model = data_model + self.featurizer = featurizer self.classifier = classifier self.records_queue = records_queue self.exception_queue = exception_queue @@ -87,8 +87,8 @@ def fieldDistance(self, record_pairs: RecordPairs) -> None: if not records: return - distances = self.data_model.distances(records) - scores = self.classifier.predict_proba(distances)[:, -1] + features = self.featurizer(records) + scores = self.classifier.predict_proba(features)[:, -1] mask = scores > 0 if not mask.any(): @@ -113,7 +113,7 @@ def fieldDistance(self, record_pairs: RecordPairs) -> None: def scoreDuplicates( record_pairs: RecordPairs, - data_model: DataModel, + featurizer: FeaturizerFunction, classifier: Classifier, num_cores: int = 1, ) -> Scores: @@ -145,7 +145,7 @@ def scoreDuplicates( 
n_map_processes = max(num_cores, 1) score_records = ScoreDupes( - data_model, + featurizer, classifier, record_pairs_queue, exception_queue, @@ -200,15 +200,15 @@ def fillQueue( class ScoreGazette(object): - def __init__(self, data_model: DataModel, classifier: Classifier): - self.data_model = data_model + def __init__(self, featurizer: FeaturizerFunction, classifier: Classifier): + self.featurizer = featurizer self.classifier = classifier def __call__(self, block: Block) -> Scores: record_ids, records = zip(*(zip(*each) for each in block)) - distances = self.data_model.distances(records) - scores = self.classifier.predict_proba(distances)[:, -1] + features = self.featurizer(records) + scores = self.classifier.predict_proba(features)[:, -1] id_type = sniff_id_type(record_ids) ids = numpy.array(record_ids, dtype=id_type) @@ -227,7 +227,7 @@ def __call__(self, block: Block) -> Scores: def scoreGazette( record_pairs: Blocks, - data_model: DataModel, + featurizer: FeaturizerFunction, classifier: Classifier, num_cores: int = 1, ) -> Generator[Scores, None, None]: @@ -238,7 +238,7 @@ def scoreGazette( imap, pool = appropriate_imap(num_cores) - score_records = ScoreGazette(data_model, classifier) + score_records = ScoreGazette(featurizer, classifier) for scored_pairs in imap(score_records, record_pairs): yield scored_pairs diff --git a/tests/test_core.py b/tests/test_core.py index 2b4fc1c5..9754f154 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -70,7 +70,7 @@ def setUp(self): def test_score_duplicates(self): scores = dedupe.core.scoreDuplicates( - self.records, self.data_model, self.classifier, 2 + self.records, self.data_model.distances, self.classifier, 2 ) numpy.testing.assert_equal(scores["pairs"], self.desired_scored_pairs["pairs"]) @@ -94,7 +94,7 @@ def test_score_duplicates_with_zeros(self): expected = numpy.array([(["3", "4"], 1)], dtype=dtype) scores = dedupe.core.scoreDuplicates( - records, self.data_model, self.classifier, 2 + records, 
self.data_model.distances, self.classifier, 2 ) assert isinstance(scores, numpy.memmap)