diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index 49fdc5c746795..77e3327de7ef1 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -163,6 +163,7 @@ Performance improvements - Performance improvement in :func:`util.hash_pandas_object` for PyArrow-backed string and binary types by using PyArrow's ``dictionary_encode`` instead of converting to NumPy for factorization (:issue:`48964`) - Performance improvement in :meth:`.DataFrameGroupBy.agg` and :meth:`.SeriesGroupBy.agg` with user-defined functions (:issue:`46505`) - Performance improvement in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` when data contains no NaN values (:issue:`64857`) +- Performance improvement in :meth:`DataFrame.corr` for ``method=kendall`` (:issue:`28329`) - Performance improvement in :meth:`DataFrame.fillna` and :meth:`Series.fillna` with scalar fill value for float, object, nullable, and datetime-like dtypes (:issue:`42147`) - Performance improvement in :meth:`DataFrame.from_records` when passing a 2D :class:`numpy.ndarray` (:issue:`22025`) - Performance improvement in :meth:`DataFrame.insert` when the number of blocks is small (:issue:`57641`) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index 5f43e2665d02b..b26d98c43d261 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -52,6 +52,10 @@ def nancorr_spearman( mat: npt.NDArray[np.float64], # ndarray[float64_t, ndim=2] minp: int = ..., ) -> npt.NDArray[np.float64]: ... # ndarray[float64_t, ndim=2] +def nancorr_kendall( + mat: npt.NDArray[np.float64], # ndarray[float64_t, ndim=2] + minp: int = ..., +) -> npt.NDArray[np.float64]: ... # ndarray[float64_t, ndim=2] # ---------------------------------------------------------------------- diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 97aa64593a3fc..89f9e48e14407 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -7,6 +7,7 @@ from libc.math cimport ( from libc.stdlib cimport ( free, malloc, + qsort, ) from libc.string cimport memcpy @@ -1760,6 +1761,220 @@ def axis_kurt( return result_arr +# ---------------------------------------------------------------------- +# Pairwise Kendall correlation +# +# References: +# - Knight, W. R. (1966). +# A computer method for calculating Kendall's tau with ungrouped data. +# Journal of the American Statistical Association, 61(314), 436-439. +# +# - rankcorr.jl +# https://github.com/JuliaStats/StatsBase.jl/blob/3e5382fa6b6ac90cf3d0b1904484668714d0e220/src/rankcorr.jl + +cdef struct Pair: + float64_t x + float64_t y + +cdef int compare_pair_aux(const void* a, const void* b) noexcept nogil: + cdef: + Pair *p1 = a + Pair *p2 = b + if p1.x < p2.x: + return -1 + if p1.x > p2.x: + return 1 + if p1.y < p2.y: + return -1 + if p1.y > p2.y: + return 1 + return 0 + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef int64_t insertion_sort( + float64_t[:] v, + Py_ssize_t lo, + Py_ssize_t hi +) noexcept nogil: + cdef: + int64_t nswaps = 0 + Py_ssize_t i, j + float64_t x + + for i in range(lo + 1, hi + 1): + j = i + x = v[i] + while j > lo and x < v[j - 1]: + nswaps += 1 + v[j] = v[j - 1] + j -= 1 + v[j] = x + return nswaps + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef int64_t merge_sort( + float64_t[:] v, + float64_t[:] t, + Py_ssize_t lo, + Py_ssize_t hi +) noexcept nogil: + cdef: + int64_t nswaps = 0 + Py_ssize_t m, i, j, k + + if lo < hi: + if hi - lo <= 64: + return insertion_sort(v, lo, hi) + + m = lo + (hi - lo) // 2 + nswaps = merge_sort(v, t, lo, m) + nswaps += merge_sort(v, t, m + 1, hi) + + for i in range(lo, m + 1): + t[i - lo] = v[i] + + i = 0 + j = m + 1 + k = lo + while k < j <= hi and i <= m - lo: + if v[j] < t[i]: + v[k] = v[j] + j += 1 + nswaps += (m - lo + 1 - i) + else: + v[k] = t[i] + i += 1 + k += 1 + + while i <= m - lo: + v[k] = t[i] + k += 1 + i += 1 + return nswaps + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef int64_t count_ties( + const float64_t[:] v, + Py_ssize_t lo, + Py_ssize_t hi +) noexcept nogil: + cdef: + int64_t result = 0 + int64_t tie_count = 0 + Py_ssize_t i + + for i in range(lo + 1, hi + 1): + if v[i] == v[i - 1]: + tie_count += 1 + elif tie_count > 0: + result += tie_count * (tie_count + 1) // 2 + tie_count = 0 + if tie_count > 0: + result += tie_count * (tie_count + 1) // 2 + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef float64_t compute_kendall_corr_1d( + const float64_t[:] x, + float64_t[:] y, + float64_t[:] temp_t +) noexcept nogil: + cdef: + Py_ssize_t n = len(x) + int64_t npairs = n * (n - 1) // 2 + int64_t ntiesx = 0 + int64_t ntiesy = 0 + int64_t ndoubleties = 0 + int64_t nswaps = 0 + Py_ssize_t i, k = 0 + float64_t divisor + + if n < 2: + return NaN + + for i in range(1, n): + if x[i] == x[i-1]: + k += 1 + elif k > 0: + ndoubleties += count_ties(y, i - k - 1, i - 1) + ntiesx += k * (k + 1) // 2 + k = 0 + if k > 0: + ndoubleties += count_ties(y, n - k - 1, n - 1) + ntiesx += k * (k + 1) // 2 + + nswaps = merge_sort(y, temp_t, 0, n - 1) + ntiesy = count_ties(y, 0, n - 1) + + divisor = sqrt((npairs - ntiesx) * (npairs - ntiesy)) + if divisor == 0: + return NaN + + return (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) / divisor + + +@cython.boundscheck(False) +@cython.wraparound(False) +def nancorr_kendall(const float64_t[:, :] mat, Py_ssize_t minp=1): + cdef: + Py_ssize_t i, xi, yi, N, K, nobs + ndarray[float64_t, ndim=2] result + ndarray[uint8_t, ndim=2] mask + float64_t[::1] x_obs, y_obs, temp_t + Pair *pairs = NULL + float64_t val + + N, K = (mat).shape + result = np.empty((K, K), dtype=np.float64) + mask = np.isfinite(mat).view(np.uint8) + + pairs = malloc(N * sizeof(Pair)) + if pairs is NULL: + raise MemoryError() + + x_obs = np.empty(N, dtype=np.float64) + y_obs = np.empty(N, dtype=np.float64) + temp_t = np.empty(N, dtype=np.float64) + + with nogil: + for xi in range(K): + for yi in range(xi + 1): + nobs = 0 + for i in range(N): + if mask[i, xi] and mask[i, yi]: + pairs[nobs].x = mat[i, xi] + pairs[nobs].y = mat[i, yi] + nobs += 1 + + if nobs < minp: + result[xi, yi] = result[yi, xi] = NaN + continue + + if xi == yi: + result[xi, yi] = 1.0 + continue + + qsort(pairs, nobs, sizeof(Pair), compare_pair_aux) + + for i in range(nobs): + x_obs[i] = pairs[i].x + y_obs[i] = pairs[i].y + + val = compute_kendall_corr_1d(x_obs[:nobs], y_obs[:nobs], temp_t[:nobs]) + result[xi, yi] = result[yi, xi] = val + free(pairs) + + return result + + # generated from template include "algos_common_helper.pxi" include "algos_take_helper.pxi" diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b266a0878fbb5..b887cd7f71248 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -15334,7 +15334,9 @@ def corr( correl = libalgos.nancorr(mat, minp=min_periods) elif method == "spearman": correl = libalgos.nancorr_spearman(mat, minp=min_periods) - elif method == "kendall" or callable(method): + elif method == "kendall": + correl = libalgos.nancorr_kendall(mat, minp=min_periods) + elif callable(method): if min_periods is None: min_periods = 1 mat = mat.T