Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
5ff8d22
ref: don't pass DataModel into BlockLearners
NickCrews Sep 16, 2022
3f75472
ref: don't store DataModel in DisagreementLearner
NickCrews Sep 16, 2022
7aa69ea
ref: Don't require DataModel in MatchLearner
NickCrews Sep 16, 2022
7e5633a
bugfix:? Copy candidate preds in RecordLinkDisagreementLearner
NickCrews Sep 16, 2022
1d1b871
ref: Remove use of DataModel from active learners
NickCrews Sep 16, 2022
7350aed
ref: Don't use DataModel in core
NickCrews Sep 16, 2022
3c20417
ref: remove __len__ from DataModel
NickCrews Sep 16, 2022
25f91a5
ref: make typifying variables more clear
NickCrews Sep 16, 2022
b2234e8
ref: tweak error raising in typify_variables()
NickCrews Sep 16, 2022
13b9f50
ref: Clarify only_custom logic in typify_variables
NickCrews Sep 16, 2022
78ef7ff
typ: Fix typing of typify_variables()
NickCrews Sep 16, 2022
77b4192
ref: Further rename field to variable in datamodel
NickCrews Sep 16, 2022
497badc
test: Add more tests for interaction variables
NickCrews Sep 16, 2022
c04981d
Ensure unique variable names
NickCrews Sep 16, 2022
3e55496
ref: Remove "type" from non-exposed variables
NickCrews Sep 16, 2022
f32f511
ref: Create InteractionVariables the same as the rest
NickCrews Sep 16, 2022
e169cc2
ref: Move check for empty var def into typify_variables
NickCrews Sep 16, 2022
c181763
Move _load_settings() into separate function
NickCrews Sep 15, 2022
241c0a6
ref: Add versioning to _load_settings()
NickCrews Sep 16, 2022
83a2e89
ref: improve settings loading exception reporting
NickCrews Sep 16, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions dedupe/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ class TrainingData(TypedDict):
distinct: List[RecordDictPair]


# Takes pairs of records and generates a (n_samples X n_features) array.
# NOTE: numpy.float64 is used instead of the numpy.float_ alias — the alias
# is identical (it has always pointed at float64) but was removed in NumPy 2.0,
# so the explicit name keeps this alias importable on modern NumPy.
FeaturizerFunction = Callable[
    [Sequence[RecordDictPair]], numpy.typing.NDArray[numpy.float64]
]


class Classifier(Protocol):
"""Takes an array of pairwise distances and computes the likelihood they are a pair."""

Expand Down
69 changes: 48 additions & 21 deletions dedupe/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import sqlite3
import tempfile
import warnings
from io import BytesIO
from typing import TYPE_CHECKING, cast

import numpy
Expand Down Expand Up @@ -107,7 +108,7 @@ def score(self, pairs: RecordPairs) -> Scores:
"""
try:
matches = core.scoreDuplicates(
pairs, self.data_model, self.classifier, self.num_cores
pairs, self.data_model.distances, self.classifier, self.num_cores
)
except RuntimeError:
raise RuntimeError(
Expand Down Expand Up @@ -824,7 +825,7 @@ def score(self, blocks: Blocks) -> Generator[Scores, None, None]:
"""

matches = core.scoreGazette(
blocks, self.data_model, self.classifier, self.num_cores
blocks, self.data_model.distances, self.classifier, self.num_cores
)

return matches
Expand Down Expand Up @@ -978,10 +979,37 @@ def __init__(
"""
super().__init__(num_cores, in_memory, **kwargs)

self.data_model, self.classifier, self.predicates = self._load_settings(
settings_file
)

logger.info("Predicate set:")
for predicate in self.predicates:
logger.info(predicate)

self._fingerprinter = blocking.Fingerprinter(self.predicates)

@classmethod
def _load_settings(
cls, settings_file: BinaryIO
) -> tuple[datamodel.DataModel, Classifier, list[dedupe.predicates.Predicate]]:
# Make a copy so we can read it multiple times
settings_file = BytesIO(settings_file.read())
settings_file.seek(0)
catchall_exception = SettingsFileLoadingException(
"Something has gone wrong with loading the settings file. "
"Try deleting the file"
)
try:
self.data_model = pickle.load(settings_file)
self.classifier = pickle.load(settings_file)
self.predicates = pickle.load(settings_file)
version = pickle.load(settings_file)
if not isinstance(version, int):
settings_file.seek(0)
return cls._load_settings_v0(settings_file)
else:
raise SettingsFileLoadingException(
"Settings file version {} not understood".format(version)
)

except (KeyError, AttributeError):
raise SettingsFileLoadingException(
"This settings file is not compatible with "
Expand All @@ -997,21 +1025,18 @@ def __init__(
"install that library: `pip install rlr`"
)
else:
raise SettingsFileLoadingException(
"Something has gone wrong with loading the settings file. "
"Try deleting the file"
) from exc
except: # noqa: E722
raise SettingsFileLoadingException(
"Something has gone wrong with loading the settings file. "
"Try deleting the file"
)
raise
except Exception as exc:
raise catchall_exception from exc

logger.info("Predicate set:")
for predicate in self.predicates:
logger.info(predicate)

self._fingerprinter = blocking.Fingerprinter(self.predicates)
@staticmethod
def _load_settings_v0(
    settings_file: BinaryIO,
) -> tuple[datamodel.DataModel, Classifier, list[dedupe.predicates.Predicate]]:
    """Read the legacy (unversioned) settings layout.

    The v0 format is three back-to-back pickles — the data model, the
    trained classifier, and the blocking predicates — written in that
    order, so they must be read back in the same order.
    """
    loaded = [pickle.load(settings_file) for _ in range(3)]
    data_model, classifier, predicates = loaded
    return data_model, classifier, predicates


class ActiveMatching(Matching):
Expand Down Expand Up @@ -1325,7 +1350,8 @@ def prepare_training(
examples, y = flatten_training(self.training_pairs)

self.active_learner = labeler.DedupeDisagreementLearner(
self.data_model,
self.data_model.predicates,
self.data_model.distances,
data,
index_include=examples,
)
Expand Down Expand Up @@ -1392,7 +1418,8 @@ def prepare_training(
examples, y = flatten_training(self.training_pairs)

self.active_learner = labeler.RecordLinkDisagreementLearner(
self.data_model,
self.data_model.predicates,
self.data_model.distances,
data_1,
data_2,
index_include=examples,
Expand Down
26 changes: 13 additions & 13 deletions dedupe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@
Classifier,
ClosableJoinable,
Data,
FeaturizerFunction,
Literal,
MapLike,
RecordID,
RecordIDDType,
RecordPairs,
Scores,
)
from dedupe.datamodel import DataModel

_Queue = Union[multiprocessing.dummy.Queue, multiprocessing.Queue]

Expand All @@ -53,15 +53,15 @@ class BlockingError(Exception):
class ScoreDupes(object):
def __init__(
self,
data_model: DataModel,
featurizer: FeaturizerFunction,
classifier: Classifier,
records_queue: _Queue,
exception_queue: _Queue,
score_file_path: str,
dtype: numpy.dtype,
offset,
):
self.data_model = data_model
self.featurizer = featurizer
self.classifier = classifier
self.records_queue = records_queue
self.exception_queue = exception_queue
Expand All @@ -87,8 +87,8 @@ def fieldDistance(self, record_pairs: RecordPairs) -> None:
if not records:
return

distances = self.data_model.distances(records)
scores = self.classifier.predict_proba(distances)[:, -1]
features = self.featurizer(records)
scores = self.classifier.predict_proba(features)[:, -1]

mask = scores > 0
if not mask.any():
Expand All @@ -113,7 +113,7 @@ def fieldDistance(self, record_pairs: RecordPairs) -> None:

def scoreDuplicates(
record_pairs: RecordPairs,
data_model: DataModel,
featurizer: FeaturizerFunction,
classifier: Classifier,
num_cores: int = 1,
) -> Scores:
Expand Down Expand Up @@ -145,7 +145,7 @@ def scoreDuplicates(

n_map_processes = max(num_cores, 1)
score_records = ScoreDupes(
data_model,
featurizer,
classifier,
record_pairs_queue,
exception_queue,
Expand Down Expand Up @@ -200,15 +200,15 @@ def fillQueue(


class ScoreGazette(object):
def __init__(self, data_model: DataModel, classifier: Classifier):
self.data_model = data_model
def __init__(self, featurizer: FeaturizerFunction, classifier: Classifier):
self.featurizer = featurizer
self.classifier = classifier

def __call__(self, block: Block) -> Scores:
record_ids, records = zip(*(zip(*each) for each in block))

distances = self.data_model.distances(records)
scores = self.classifier.predict_proba(distances)[:, -1]
features = self.featurizer(records)
scores = self.classifier.predict_proba(features)[:, -1]

id_type = sniff_id_type(record_ids)
ids = numpy.array(record_ids, dtype=id_type)
Expand All @@ -227,7 +227,7 @@ def __call__(self, block: Block) -> Scores:

def scoreGazette(
record_pairs: Blocks,
data_model: DataModel,
featurizer: FeaturizerFunction,
classifier: Classifier,
num_cores: int = 1,
) -> Generator[Scores, None, None]:
Expand All @@ -238,7 +238,7 @@ def scoreGazette(

imap, pool = appropriate_imap(num_cores)

score_records = ScoreGazette(data_model, classifier)
score_records = ScoreGazette(featurizer, classifier)

for scored_pairs in imap(score_records, record_pairs):
yield scored_pairs
Expand Down
Loading