Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ Performance improvements
- Performance improvement in :func:`util.hash_pandas_object` for PyArrow-backed string and binary types by using PyArrow's ``dictionary_encode`` instead of converting to NumPy for factorization (:issue:`48964`)
- Performance improvement in :meth:`.DataFrameGroupBy.agg` and :meth:`.SeriesGroupBy.agg` with user-defined functions (:issue:`46505`)
- Performance improvement in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` when data contains no NaN values (:issue:`64857`)
- Performance improvement in :meth:`DataFrame.corr` for ``method=kendall`` (:issue:`28329`)
- Performance improvement in :meth:`DataFrame.fillna` and :meth:`Series.fillna` with scalar fill value for float, object, nullable, and datetime-like dtypes (:issue:`42147`)
- Performance improvement in :meth:`DataFrame.from_records` when passing a 2D :class:`numpy.ndarray` (:issue:`22025`)
- Performance improvement in :meth:`DataFrame.insert` when the number of blocks is small (:issue:`57641`)
Expand Down
4 changes: 4 additions & 0 deletions pandas/_libs/algos.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ def nancorr_spearman(
mat: npt.NDArray[np.float64], # ndarray[float64_t, ndim=2]
minp: int = ...,
) -> npt.NDArray[np.float64]: ... # ndarray[float64_t, ndim=2]
def nancorr_kendall(
mat: npt.NDArray[np.float64], # ndarray[float64_t, ndim=2]
minp: int = ...,
) -> npt.NDArray[np.float64]: ... # ndarray[float64_t, ndim=2]

# ----------------------------------------------------------------------

Expand Down
215 changes: 215 additions & 0 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ from libc.math cimport (
from libc.stdlib cimport (
free,
malloc,
qsort,
)
from libc.string cimport memcpy

Expand Down Expand Up @@ -1760,6 +1761,220 @@ def axis_kurt(
return result_arr


# ----------------------------------------------------------------------
# Pairwise Kendall correlation
#
# References:
# - Knight, W. R. (1966).
# A computer method for calculating Kendall's tau with ungrouped data.
# Journal of the American Statistical Association, 61(314), 436-439.
#
# - rankcorr.jl
# https://github.com/JuliaStats/StatsBase.jl/blob/3e5382fa6b6ac90cf3d0b1904484668714d0e220/src/rankcorr.jl

cdef struct Pair:
float64_t x
float64_t y

cdef int compare_pair_aux(const void* a, const void* b) noexcept nogil:
cdef:
Pair *p1 = <Pair*>a
Pair *p2 = <Pair*>b
if p1.x < p2.x:
return -1
if p1.x > p2.x:
return 1
if p1.y < p2.y:
return -1
if p1.y > p2.y:
return 1
return 0


@cython.boundscheck(False)
@cython.wraparound(False)
cdef int64_t insertion_sort(
float64_t[:] v,
Py_ssize_t lo,
Py_ssize_t hi
) noexcept nogil:
cdef:
int64_t nswaps = 0
Py_ssize_t i, j
float64_t x

for i in range(lo + 1, hi + 1):
j = i
x = v[i]
while j > lo and x < v[j - 1]:
nswaps += 1
v[j] = v[j - 1]
j -= 1
v[j] = x
return nswaps


@cython.boundscheck(False)
@cython.wraparound(False)
cdef int64_t merge_sort(
float64_t[:] v,
float64_t[:] t,
Py_ssize_t lo,
Py_ssize_t hi
) noexcept nogil:
cdef:
int64_t nswaps = 0
Py_ssize_t m, i, j, k

if lo < hi:
if hi - lo <= 64:
return insertion_sort(v, lo, hi)

m = lo + (hi - lo) // 2
nswaps = merge_sort(v, t, lo, m)
nswaps += merge_sort(v, t, m + 1, hi)

for i in range(lo, m + 1):
t[i - lo] = v[i]

i = 0
j = m + 1
k = lo
while k < j <= hi and i <= m - lo:
if v[j] < t[i]:
v[k] = v[j]
j += 1
nswaps += (m - lo + 1 - i)
else:
v[k] = t[i]
i += 1
k += 1

while i <= m - lo:
v[k] = t[i]
k += 1
i += 1
return nswaps


@cython.boundscheck(False)
@cython.wraparound(False)
cdef int64_t count_ties(
const float64_t[:] v,
Py_ssize_t lo,
Py_ssize_t hi
) noexcept nogil:
cdef:
int64_t result = 0
int64_t tie_count = 0
Py_ssize_t i

for i in range(lo + 1, hi + 1):
if v[i] == v[i - 1]:
tie_count += 1
elif tie_count > 0:
result += tie_count * (tie_count + 1) // 2
tie_count = 0
if tie_count > 0:
result += tie_count * (tie_count + 1) // 2
return result


@cython.boundscheck(False)
@cython.wraparound(False)
cdef float64_t compute_kendall_corr_1d(
const float64_t[:] x,
float64_t[:] y,
float64_t[:] temp_t
) noexcept nogil:
cdef:
Py_ssize_t n = len(x)
int64_t npairs = <int64_t>n * (n - 1) // 2
int64_t ntiesx = 0
int64_t ntiesy = 0
int64_t ndoubleties = 0
int64_t nswaps = 0
Py_ssize_t i, k = 0
float64_t divisor

if n < 2:
return NaN

for i in range(1, n):
if x[i] == x[i-1]:
k += 1
elif k > 0:
ndoubleties += count_ties(y, i - k - 1, i - 1)
ntiesx += <int64_t>k * (k + 1) // 2
k = 0
if k > 0:
ndoubleties += count_ties(y, n - k - 1, n - 1)
ntiesx += <int64_t>k * (k + 1) // 2

nswaps = merge_sort(y, temp_t, 0, n - 1)
ntiesy = count_ties(y, 0, n - 1)

divisor = sqrt(<float64_t>(npairs - ntiesx) * <float64_t>(npairs - ntiesy))
if divisor == 0:
return NaN

return (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / divisor


@cython.boundscheck(False)
@cython.wraparound(False)
def nancorr_kendall(const float64_t[:, :] mat, Py_ssize_t minp=1):
cdef:
Py_ssize_t i, xi, yi, N, K, nobs
ndarray[float64_t, ndim=2] result
ndarray[uint8_t, ndim=2] mask
float64_t[::1] x_obs, y_obs, temp_t
Pair *pairs = NULL
float64_t val

N, K = (<object>mat).shape
result = np.empty((K, K), dtype=np.float64)
mask = np.isfinite(mat).view(np.uint8)

pairs = <Pair*>malloc(N * sizeof(Pair))
if pairs is NULL:
raise MemoryError()

x_obs = np.empty(N, dtype=np.float64)
y_obs = np.empty(N, dtype=np.float64)
temp_t = np.empty(N, dtype=np.float64)

with nogil:
for xi in range(K):
for yi in range(xi + 1):
nobs = 0
for i in range(N):
if mask[i, xi] and mask[i, yi]:
pairs[nobs].x = mat[i, xi]
pairs[nobs].y = mat[i, yi]
nobs += 1

if nobs < minp:
result[xi, yi] = result[yi, xi] = NaN
continue

if xi == yi:
result[xi, yi] = 1.0
continue

qsort(pairs, nobs, sizeof(Pair), compare_pair_aux)

for i in range(nobs):
x_obs[i] = pairs[i].x
y_obs[i] = pairs[i].y

val = compute_kendall_corr_1d(x_obs[:nobs], y_obs[:nobs], temp_t[:nobs])
result[xi, yi] = result[yi, xi] = val
free(pairs)

return result


# generated from template
include "algos_common_helper.pxi"
include "algos_take_helper.pxi"
4 changes: 3 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -15334,7 +15334,9 @@ def corr(
correl = libalgos.nancorr(mat, minp=min_periods)
elif method == "spearman":
correl = libalgos.nancorr_spearman(mat, minp=min_periods)
elif method == "kendall" or callable(method):
elif method == "kendall":
correl = libalgos.nancorr_kendall(mat, minp=min_periods)
elif callable(method):
if min_periods is None:
min_periods = 1
mat = mat.T
Expand Down
Loading