Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,6 @@ repos:
entry: '[a-zA-Z0-9]\`\`?[a-zA-Z0-9]'
types: [rst]
files: ^doc/source/
- id: seed-check-asv
name: Check for unnecessary random seeds in asv benchmarks
language: pygrep
entry: 'np\.random\.seed'
files: ^asv_bench/benchmarks
exclude: ^asv_bench/benchmarks/pandas_vb_common\.py
- id: unwanted-patterns-in-tests
name: Unwanted patterns in tests
language: pygrep
Expand Down
91 changes: 54 additions & 37 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,57 +12,70 @@
pass


def _make_factorize_data(dtype, N):
if dtype in ("int64", "Int64", "object"):
data = pd.Index(np.arange(N), dtype=dtype)
elif dtype == "float64":
data = pd.Index(np.random.randn(N), dtype=dtype)
elif dtype == "datetime64[ns]":
data = pd.date_range("2011-01-01", freq="h", periods=N)
elif dtype == "datetime64[ns, tz]":
data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
elif dtype == "object_str":
data = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
elif dtype == "string[pyarrow]":
data = pd.array(
pd.Index([f"i-{i}" for i in range(N)], dtype=object),
dtype="string[pyarrow]",
)
else:
raise NotImplementedError
return data


# Dtype labels exercised by the factorize benchmark classes below.
# Must stay in sync with the branches handled in _make_factorize_data.
_FACTORIZE_DTYPES = [
    "int64",
    "float64",
    "object",
    "object_str",
    "datetime64[ns]",
    "datetime64[ns, tz]",
    "Int64",
    "string[pyarrow]",
]


class Factorize:
    """Time ``pd.factorize`` across dtypes, unique/duplicated data, and sort.

    Peak-memory measurement lives in ``FactorizePeakmem`` so it does not
    multiply across the unique/sort parameter grid.
    """

    # This span of the diff interleaved the deleted inline dtype list /
    # setup body with the new helper-based versions; only the post-change
    # lines are kept here.
    params = [
        [True, False],
        [True, False],
        _FACTORIZE_DTYPES,
    ]
    param_names = ["unique", "sort", "dtype"]

    def setup(self, unique, sort, dtype):
        N = 10**5
        data = _make_factorize_data(dtype, N)
        # Repeating each element 5x yields the non-unique variant.
        if not unique:
            data = data.repeat(5)
        self.data = data

    def time_factorize(self, unique, sort, dtype):
        pd.factorize(self.data, sort=sort)

class FactorizePeakmem:
    """Peak-memory benchmark for ``pd.factorize``, parameterized by dtype only."""

    # peakmem is driven by allocation patterns that vary by dtype; unique/sort
    # are held fixed to keep this benchmark small.
    params = _FACTORIZE_DTYPES
    param_names = ["dtype"]

    def setup(self, dtype):
        N = 10**5
        # repeat(5) fixes the non-unique case as the measured variant.
        self.data = _make_factorize_data(dtype, N).repeat(5)

    def peakmem_factorize(self, dtype):
        pd.factorize(self.data, sort=False)


class Duplicated:
Expand Down Expand Up @@ -132,6 +145,10 @@ def time_duplicated(self, unique, keep, dtype):

class Hashing:
def setup_cache(self):
# setup_cache runs in a different sub-process where module-level setup is
# never called, so seed locally to keep the cached frame reproducible
# across runs.
np.random.seed(1234)
N = 10**5

df = pd.DataFrame(
Expand Down Expand Up @@ -177,13 +194,13 @@ class Quantile:
params = [
[0, 0.5, 1],
["linear", "nearest", "lower", "higher", "midpoint"],
["float64", "int64", "uint64"],
["float64", "int64"],
]
param_names = ["quantile", "interpolation", "dtype"]

def setup(self, quantile, interpolation, dtype):
N = 10**5
if dtype in ["int64", "uint64"]:
if dtype == "int64":
data = np.arange(N, dtype=dtype)
elif dtype == "float64":
data = np.random.randn(N)
Expand Down
Loading