Skip to content
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions dedupe/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
from pkgutil import extend_path

__path__ = extend_path(__path__, __name__)

from dedupe._init import * # noqa
from dedupe.api import ( # noqa: F401
Dedupe,
Gazetteer,
RecordLink,
StaticDedupe,
StaticGazetteer,
StaticRecordLink,
)
from dedupe.convenience import ( # noqa: F401
canonicalize,
console_label,
training_data_dedupe,
training_data_link,
)
from dedupe.serializer import read_training, write_training # noqa: F401
15 changes: 0 additions & 15 deletions dedupe/_init.py

This file was deleted.

40 changes: 30 additions & 10 deletions dedupe/datamodel.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,18 @@
from __future__ import annotations

import copyreg
import pkgutil
import importlib
import types
from typing import TYPE_CHECKING, cast

import numpy
import pluggy

import dedupe.variables
import dedupe.hookspecs
from dedupe.variables.base import FieldType as FieldVariable
from dedupe.variables.base import MissingDataType, Variable
from dedupe.variables.interaction import InteractionType

for _, module, _ in pkgutil.iter_modules( # type: ignore
dedupe.variables.__path__, "dedupe.variables."
):
__import__(module)

if TYPE_CHECKING:
from typing import Generator, Iterable, Sequence

Expand All @@ -28,7 +24,26 @@
)
from dedupe.predicates import Predicate

VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k}

# Dotted names of the built-in variable modules. Each one is imported and
# registered with the plugin manager below.
DEFAULT_VARIABLES = [
    "dedupe.variables.base",
    "dedupe.variables.string",
    "dedupe.variables.categorical_type",
    "dedupe.variables.exists",
    "dedupe.variables.exact",
    "dedupe.variables.latlong",
    "dedupe.variables.interaction",
    "dedupe.variables.price",
    "dedupe.variables.set",
]

# Plugin manager for the "dedupe" project name; the hook specifications it
# enforces are the ones declared in ``dedupe.hookspecs``.
pm = pluggy.PluginManager("dedupe")
pm.add_hookspecs(dedupe.hookspecs)
# Pick up plugins published under the "dedupe" setuptools entry-point group.
# NOTE(review): this runs before the built-ins are registered — confirm the
# resulting hook call order is the intended one.
pm.load_setuptools_entrypoints("dedupe")

# Import every built-in variable module and register the module object itself
# as a plugin, using its dotted module name as the plugin name.
for plugin in DEFAULT_VARIABLES:
    mod = importlib.import_module(plugin)
    pm.register(mod, plugin)


class DataModel(object):
Expand Down Expand Up @@ -145,6 +160,11 @@ def __setstate__(self, d):
def typify_variables(
variable_definitions: Iterable[VariableDefinition],
) -> tuple[list[FieldVariable], list[Variable]]:

variable_types = {}
for variable_type in pm.hook.register_variable():
variable_types.update(variable_type)
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

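It would be good to do some protocol checking here, i.e. to validate what each `register_variable` hook returns before merging it into `variable_types`.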


primary_variables: list[FieldVariable] = []
all_variables: list[Variable] = []
only_custom = True
Expand Down Expand Up @@ -181,11 +201,11 @@ def typify_variables(
]

try:
variable_class = VARIABLE_CLASSES[variable_type]
variable_class = variable_types[variable_type]
except KeyError:
raise KeyError(
"Field type %s not valid. Valid types include %s"
% (definition["type"], ", ".join(VARIABLE_CLASSES))
% (definition["type"], ", ".join(variable_types))
)

variable_object = variable_class(definition)
Expand Down
9 changes: 9 additions & 0 deletions dedupe/hookspecs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import pluggy

hookimpl = pluggy.HookimplMarker("dedupe")
hookspec = pluggy.HookspecMarker("dedupe")


@hookspec
def register_variable():
    """Hook through which a plugin contributes variable types to a datamodel.

    Implementations are expected to return a ``dict`` that maps a variable's
    ``type`` name to the class implementing it.
    """
3 changes: 0 additions & 3 deletions dedupe/variables/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from pkgutil import extend_path

__path__ = extend_path(__path__, __name__)
17 changes: 7 additions & 10 deletions dedupe/variables/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
from typing import TYPE_CHECKING

from dedupe import predicates
from dedupe.hookspecs import hookimpl

if TYPE_CHECKING:
from typing import Any, ClassVar, Generator, Iterable, Optional, Sequence, Type
from typing import Any, ClassVar, Iterable, Sequence, Type

from dedupe._typing import Comparator, PredicateFunction, VariableDefinition

Expand Down Expand Up @@ -47,15 +48,6 @@ def __getstate__(self) -> dict[str, Any]:

return odict

@classmethod
def all_subclasses(
    cls,
) -> Generator[tuple[Optional[str], Type["Variable"]], None, None]:
    """Yield a ``(type, class)`` pair for every descendant of ``cls``.

    The walk is depth-first: a subclass is yielded, then all of its own
    descendants, before the next direct subclass is visited. The first item
    of each pair is the class's ``type`` attribute, or ``None`` when the
    attribute lookup finds nothing.
    """
    for subclass in cls.__subclasses__():
        type_name = getattr(subclass, "type", None)
        yield type_name, subclass
        # Recurse into this subclass before moving on to its siblings.
        yield from subclass.all_subclasses()


class DerivedType(Variable):
type = "Derived"
Expand Down Expand Up @@ -135,3 +127,8 @@ def indexPredicates(
index_predicates.append(predicate(threshold, field))

return index_predicates


@hookimpl
def register_variable():
    """Offer ``CustomType`` to the plugin manager, keyed by its ``type`` name."""
    registry = {CustomType.type: CustomType}
    return registry
6 changes: 6 additions & 0 deletions dedupe/variables/categorical_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from dedupe import predicates
from dedupe._typing import PredicateFunction, VariableDefinition
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import DerivedType, FieldType


Expand Down Expand Up @@ -36,3 +37,8 @@ def __init__(self, definition: VariableDefinition):

def __len__(self) -> int:
return len(self.higher_vars)


@hookimpl
def register_variable():
    """Offer ``CategoricalType`` to the plugin manager, keyed by its ``type`` name."""
    registry = {CategoricalType.type: CategoricalType}
    return registry
6 changes: 6 additions & 0 deletions dedupe/variables/exact.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Any

from dedupe import predicates
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType


Expand All @@ -14,3 +15,8 @@ def comparator(field_1: Any, field_2: Any) -> int:
return 1
else:
return 0


@hookimpl
def register_variable():
    """Offer ``ExactType`` to the plugin manager, keyed by its ``type`` name."""
    registry = {ExactType.type: ExactType}
    return registry
6 changes: 6 additions & 0 deletions dedupe/variables/exists.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from categorical import CategoricalComparator

from dedupe._typing import PredicateFunction, VariableDefinition
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import DerivedType
from dedupe.variables.categorical_type import CategoricalType

Expand Down Expand Up @@ -37,3 +38,8 @@ def comparator(self, field_1: Any, field_2: Any) -> list[int]:
# This flag tells fieldDistances in dedupe.core to pass
# missing values (None) into the comparator
comparator.missing = True # type: ignore


@hookimpl
def register_variable():
    """Offer ``ExistsType`` to the plugin manager, keyed by its ``type`` name."""
    registry = {ExistsType.type: ExistsType}
    return registry
6 changes: 6 additions & 0 deletions dedupe/variables/interaction.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Mapping

from dedupe._typing import VariableDefinition
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType as FieldVariable
from dedupe.variables.base import Variable

Expand Down Expand Up @@ -77,3 +78,8 @@ def atomicInteractions(
atomic_interactions.append(field)

return atomic_interactions


@hookimpl
def register_variable():
    """Offer ``InteractionType`` to the plugin manager, keyed by its ``type`` name."""
    registry = {InteractionType.type: InteractionType}
    return registry
6 changes: 6 additions & 0 deletions dedupe/variables/latlong.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from haversine import haversine

from dedupe import predicates
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType


Expand All @@ -16,3 +17,8 @@ class LatLongType(FieldType):
@staticmethod
def comparator(x: tuple[float, float], y: tuple[float, float]) -> float:
return sqrt(haversine(x, y))


@hookimpl
def register_variable():
    """Offer ``LatLongType`` to the plugin manager, keyed by its ``type`` name."""
    registry = {LatLongType.type: LatLongType}
    return registry
6 changes: 6 additions & 0 deletions dedupe/variables/price.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy

from dedupe import predicates
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType


Expand All @@ -22,3 +23,8 @@ def comparator(price_1: int | float, price_2: int | float) -> float:
return numpy.nan
else:
return abs(numpy.log10(price_1) - numpy.log10(price_2))


@hookimpl
def register_variable():
    """Offer ``PriceType`` to the plugin manager, keyed by its ``type`` name."""
    registry = {PriceType.type: PriceType}
    return registry
6 changes: 6 additions & 0 deletions dedupe/variables/set.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from dedupe import predicates
from dedupe._typing import VariableDefinition
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType


Expand Down Expand Up @@ -31,3 +32,8 @@ def __init__(self, definition: VariableDefinition):
definition["corpus"] = []

self.comparator = CosineSetSimilarity(definition["corpus"]) # type: ignore[assignment]


@hookimpl
def register_variable():
    """Offer ``SetType`` to the plugin manager, keyed by its ``type`` name."""
    registry = {SetType.type: SetType}
    return registry
10 changes: 10 additions & 0 deletions dedupe/variables/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from dedupe import predicates
from dedupe._typing import VariableDefinition
from dedupe.hookspecs import hookimpl
from dedupe.variables.base import FieldType, indexPredicates

crfEd = CRFEditDistance()
Expand Down Expand Up @@ -105,3 +106,12 @@ def __init__(self, definition: VariableDefinition):
definition["corpus"] = []

self.comparator = CosineTextSimilarity(definition["corpus"]) # type: ignore[assignment]


@hookimpl
def register_variable():
    """Offer the three string-like variable classes to the plugin manager.

    Returns a ``dict`` with one entry per class, keyed by each class's
    ``type`` name, in the order ShortString, String, Text.
    """
    string_classes = (ShortStringType, StringType, TextType)
    return {klass.type: klass for klass in string_classes}
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[project]
name = "dedupe"
description = "A python library for accurate and scaleable data deduplication and entity-resolution"
version = "2.0.19"
version = "3.0.0"
readme = "README.md"
requires-python = ">=3.7"
license = {file = "LICENSE"}
Expand Down Expand Up @@ -38,6 +38,7 @@ dependencies = [
"zope.index",
"Levenshtein_search==1.4.5",
"typing_extensions",
"pluggy",
]

[project.urls]
Expand All @@ -51,7 +52,7 @@ MailingList = "https://groups.google.com/forum/#!forum/open-source-deduplication


[build-system]
requires = ["setuptools==63",
requires = ["setuptools",
"wheel",
"cython"]
build-backend = "setuptools.build_meta"
Expand Down