diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index 49c251eb5e06a..e9219b7fc4ca0 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -180,6 +180,7 @@ Performance improvements - Performance improvement in :meth:`GroupBy.any` and :meth:`GroupBy.all` for boolean-dtype columns (:issue:`37850`) - Performance improvement in :meth:`GroupBy.first` and :meth:`GroupBy.last` for Extension Array dtypes, which no longer fall back to a slow ``apply``-based implementation (:issue:`57591`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`64330`) +- Performance improvement in :meth:`GroupBy.size` (:issue:`51750`) - Performance improvement in :meth:`Index.get_indexer` for large monotonic indexes, which now uses binary search instead of building a hash table when the number of targets is small (:issue:`14273`) - Performance improvement in :meth:`Index.join` and :meth:`Index.union` for :class:`RangeIndex` by avoiding unnecessary memory allocation in the libjoin fastpath (:issue:`54646`) - Performance improvement in :meth:`IntervalIndex.get_indexer` for monotonic non-overlapping indexes, which now uses binary search instead of the interval tree (:issue:`47614`) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 388179eae7316..30b8956753664 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -734,7 +734,10 @@ def size(self) -> Series: ngroups = self.ngroups out: np.ndarray | list if ngroups: - out = np.bincount(ids[ids != -1], minlength=ngroups) + if self.has_dropped_na: + out = np.bincount(ids + 1, minlength=ngroups + 1)[1:] + else: + out = np.bincount(ids, minlength=ngroups) else: out = [] return Series(out, index=self.result_index, dtype="int64", copy=False)