Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ Performance improvements
- Performance improvement in :func:`merge` with ``how="left"`` and ``sort=False`` when joining on a right index with unique keys (:issue:`65160`)
- Performance improvement in :func:`merge` with ``sort=False`` for single-key ``how="left"``/``how="right"`` joins when the opposite join key is sorted, unique, and range-like (:issue:`64146`)
- Performance improvement in :func:`read_csv` with ``engine="c"`` when reading from binary file-like objects (e.g. PyArrow S3 file handles) by avoiding unnecessary ``TextIOWrapper`` wrapping (:issue:`46823`)
- Performance improvement in :func:`read_hdf` for the fixed (default) format, especially on large frames (:issue:`47726`)
- Performance improvement in :func:`read_html` and the Python CSV parser when ``thousands`` is set, fixing catastrophic regex backtracking on cells with many comma-separated digit groups followed by non-numeric text (:issue:`52619`)
- Performance improvement in :func:`read_sas` by reading page header fields directly in Cython instead of falling back to Python (:issue:`47339`)
- Performance improvement in :func:`read_sas` for SAS7BDAT files by pre-computing date/datetime column classification once during metadata parsing instead of per chunk (:issue:`47339`)
Expand All @@ -172,13 +173,15 @@ Performance improvements
- Performance improvement in :meth:`DataFrame.sort_values` with multiple numeric columns by avoiding unnecessary :class:`Categorical` conversion (:issue:`15389`)
- Performance improvement in :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.mean`, :meth:`DataFrame.any`, and :meth:`DataFrame.all` with ``axis=1`` for multi-block DataFrames by avoiding a transpose (:issue:`51474`)
- Performance improvement in :meth:`DataFrame.to_excel` with the ``openpyxl`` engine when using ``engine_kwargs={"write_only": True}``, reducing memory consumption (:issue:`41681`)
- Performance improvement in :meth:`DataFrame.to_hdf` and :meth:`HDFStore.append` for table format when appending to an existing wide table with many ``data_columns`` (:issue:`25839`)
- Performance improvement in :meth:`DataFrame.to_stata` when writing object-dtype datetime columns with date formats that require year/month extraction (:issue:`64555`)
- Performance improvement in :meth:`DataFrame.unstack` and :meth:`Series.unstack` when the :class:`MultiIndex` is already sorted and the unstacked level is the last level (:issue:`65107`)
- Performance improvement in :meth:`DatetimeIndex.month_name` and :meth:`DatetimeIndex.day_name` when using the default string dtype by using PyArrow compute instead of going through an intermediate object array (:issue:`65104`)
- Performance improvement in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` for formats composed of common directives (``%Y``, ``%m``, ``%d``, ``%H``, ``%M``, ``%S``, ``%f``) (:issue:`44764`)
- Performance improvement in :meth:`GroupBy.any` and :meth:`GroupBy.all` for boolean-dtype columns (:issue:`37850`)
- Performance improvement in :meth:`GroupBy.first` and :meth:`GroupBy.last` for Extension Array dtypes, which no longer fall back to a slow ``apply``-based implementation (:issue:`57591`)
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`64330`)
- Performance improvement in :meth:`HDFStore.select_as_multiple` when no ``where`` clause is given, by avoiding a coordinate-based read (:issue:`26771`)
- Performance improvement in :meth:`Index.get_indexer` for large monotonic indexes, which now uses binary search instead of building a hash table when the number of targets is small (:issue:`14273`)
- Performance improvement in :meth:`Index.join` and :meth:`Index.union` for :class:`RangeIndex` by avoiding unnecessary memory allocation in the libjoin fastpath (:issue:`54646`)
- Performance improvement in :meth:`IntervalIndex.get_indexer` for monotonic non-overlapping indexes, which now uses binary search instead of the interval tree (:issue:`47614`)
Expand Down
38 changes: 32 additions & 6 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,29 @@ def _ensure_term(where, scope_level: int):
_table_file_open_policy_is_strict = False


_MISSING = object()


def _set_attr_if_changed(attrs, name: str, value) -> None:
    """
    Set ``attrs.<name>`` to ``value`` unless it already compares equal.

    Re-writing an HDF5 attribute to the same value is expensive — pytables
    deletes and re-creates it, hitting the disk per attribute. On wide-table
    appends this dominates runtime (GH#25839).

    Parameters
    ----------
    attrs : object
        Attribute container (typically a PyTables ``AttributeSet``); only
        ``getattr``/``setattr`` are used on it.
    name : str
        Name of the attribute to write.
    value : object
        Value the attribute should hold after the call.

    Notes
    -----
    The write is skipped only when ``bool(current == value)`` is True. If the
    attribute is absent, or the comparison cannot be reduced to a single
    boolean (``ValueError``/``TypeError``, e.g. an elementwise array
    comparison), the value is written unconditionally, so a failed comparison
    can never leave a stale attribute behind.
    """
    current = getattr(attrs, name, _MISSING)
    if current is not _MISSING:
        try:
            if bool(current == value):
                # Already holds an equal value: avoid the costly rewrite.
                return
        except (ValueError, TypeError):
            # Ambiguous or unsupported comparison: treat as "changed" and
            # fall through to the write below.
            pass
    setattr(attrs, name, value)


def _tables():
global _table_mod
global _table_file_open_policy_is_strict
Expand Down Expand Up @@ -2228,7 +2251,10 @@ def get_result(self, coordinates: bool = False):
return self

# if specified read via coordinates (necessary for multiple selections
if coordinates:
# so each table reads the same row set). Skip when self.where is None
# since every row would be selected anyway, and a coordinate-based read
# is much slower than a sequential read (GH#26771).
if coordinates and self.where is not None:
if not isinstance(self.s, Table):
raise TypeError("can only read_coordinates on a table")
where = self.s.read_coordinates(
Expand Down Expand Up @@ -2533,7 +2559,7 @@ def set_info(self, info) -> None:

def set_attr(self) -> None:
    """
    Store this column's kind under ``self.kind_attr`` on ``self.attrs``.

    The write goes through ``_set_attr_if_changed`` alone, so an attribute
    that already holds an equal value is left untouched. A preceding
    unconditional ``setattr`` would always make the values match and force
    the rewrite this helper exists to avoid.
    """
    _set_attr_if_changed(self.attrs, self.kind_attr, self.kind)

def validate_metadata(self, handler: AppendableTable) -> None:
"""validate that kind=category does not change the categories"""
Expand Down Expand Up @@ -2888,10 +2914,10 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):

def set_attr(self) -> None:
    """
    Store this column's values, meta and dtype on ``self.attrs``.

    Each attribute is written through ``_set_attr_if_changed`` alone, so
    attributes that already hold an equal value are not rewritten. Issuing
    an unconditional ``setattr`` first would defeat that check.

    Raises
    ------
    AssertionError
        If ``self.dtype`` is None.
    """
    _set_attr_if_changed(self.attrs, self.kind_attr, self.values)
    _set_attr_if_changed(self.attrs, self.meta_attr, self.meta)
    # dtype must be known before it is stored.
    assert self.dtype is not None
    _set_attr_if_changed(self.attrs, self.dtype_attr, self.dtype)


class DataIndexableCol(DataCol):
Expand Down Expand Up @@ -3633,7 +3659,7 @@ def read(
dfs.append(df)

if len(dfs) > 0:
out = concat(dfs, axis=1).copy()
out = concat(dfs, axis=1)
return out.reindex(columns=items)

return DataFrame(columns=axes[0], index=axes[1])
Expand Down
Loading