diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index ba68d142e1eda..fbfb07d2a0f02 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -152,6 +152,7 @@ Performance improvements - Performance improvement in :func:`merge` with ``how="left"`` and ``sort=False`` when joining on a right index with unique keys (:issue:`65160`) - Performance improvement in :func:`merge` with ``sort=False`` for single-key ``how="left"``/``how="right"`` joins when the opposite join key is sorted, unique, and range-like (:issue:`64146`) - Performance improvement in :func:`read_csv` with ``engine="c"`` when reading from binary file-like objects (e.g. PyArrow S3 file handles) by avoiding unnecessary ``TextIOWrapper`` wrapping (:issue:`46823`) +- Performance improvement in :func:`read_hdf` for the fixed (default) format, especially on large frames (:issue:`47726`) - Performance improvement in :func:`read_html` and the Python CSV parser when ``thousands`` is set, fixing catastrophic regex backtracking on cells with many comma-separated digit groups followed by non-numeric text (:issue:`52619`) - Performance improvement in :func:`read_sas` by reading page header fields directly in Cython instead of falling back to Python (:issue:`47339`) - Performance improvement in :func:`read_sas` for SAS7BDAT files by pre-computing date/datetime column classification once during metadata parsing instead of per chunk (:issue:`47339`) @@ -172,6 +173,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.sort_values` with multiple numeric columns by avoiding unnecessary :class:`Categorical` conversion (:issue:`15389`) - Performance improvement in :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.mean`, :meth:`DataFrame.any`, and :meth:`DataFrame.all` with ``axis=1`` for multi-block DataFrames by avoiding a transpose (:issue:`51474`) - Performance improvement in :meth:`DataFrame.to_excel` with the ``openpyxl`` engine when using ``engine_kwargs={"write_only": True}``, reducing memory consumption (:issue:`41681`) +- Performance improvement in :meth:`DataFrame.to_hdf` and :meth:`HDFStore.append` for table format when appending to an existing wide table with many ``data_columns`` (:issue:`25839`) - Performance improvement in :meth:`DataFrame.to_stata` when writing object-dtype datetime columns with date formats that require year/month extraction (:issue:`64555`) - Performance improvement in :meth:`DataFrame.unstack` and :meth:`Series.unstack` when the :class:`MultiIndex` is already sorted and the unstacked level is the last level (:issue:`65107`) - Performance improvement in :meth:`DatetimeIndex.month_name` and :meth:`DatetimeIndex.day_name` when using the default string dtype by using PyArrow compute instead of going through an intermediate object array (:issue:`65104`) @@ -179,6 +181,7 @@ Performance improvements - Performance improvement in :meth:`GroupBy.any` and :meth:`GroupBy.all` for boolean-dtype columns (:issue:`37850`) - Performance improvement in :meth:`GroupBy.first` and :meth:`GroupBy.last` for Extension Array dtypes, which no longer fall back to a slow ``apply``-based implementation (:issue:`57591`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`64330`) +- Performance improvement in :meth:`HDFStore.select_as_multiple` when no ``where`` clause is given, by avoiding a coordinate-based read (:issue:`26771`) - Performance improvement in :meth:`Index.get_indexer` for large monotonic indexes, which now uses binary search instead of building a hash table when the number of targets is small (:issue:`14273`) - Performance improvement in :meth:`Index.join` and :meth:`Index.union` for :class:`RangeIndex` by avoiding unnecessary memory allocation in the libjoin fastpath (:issue:`54646`) - Performance improvement in :meth:`IntervalIndex.get_indexer` for monotonic non-overlapping indexes, which now uses binary search instead of the interval tree (:issue:`47614`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 03da842157e1e..4870f9627460f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -255,6 +255,29 @@ def _ensure_term(where, scope_level: int): _table_file_open_policy_is_strict = False +_MISSING = object() + + +def _set_attr_if_changed(attrs, name: str, value) -> None: + """ + setattr on a PyTables AttributeSet only if the on-disk value differs. + + Re-writing an HDF5 attribute to the same value is expensive — pytables + deletes and re-creates it, hitting the disk per attribute. On wide-table + appends this dominates runtime (GH#25839). + """ + current = getattr(attrs, name, _MISSING) + if current is _MISSING: + setattr(attrs, name, value) + return + try: + equal = bool(current == value) + except (ValueError, TypeError): + equal = False + if not equal: + setattr(attrs, name, value) + + def _tables(): global _table_mod global _table_file_open_policy_is_strict @@ -2228,7 +2251,10 @@ def get_result(self, coordinates: bool = False): return self # if specified read via coordinates (necessary for multiple selections - if coordinates: + # so each table reads the same row set). Skip when self.where is None + # since every row would be selected anyway, and a coordinate-based read + # is much slower than a sequential read (GH#26771). + if coordinates and self.where is not None: if not isinstance(self.s, Table): raise TypeError("can only read_coordinates on a table") where = self.s.read_coordinates( @@ -2533,7 +2559,7 @@ def set_info(self, info) -> None: def set_attr(self) -> None: """set the kind for this column""" - setattr(self.attrs, self.kind_attr, self.kind) + _set_attr_if_changed(self.attrs, self.kind_attr, self.kind) def validate_metadata(self, handler: AppendableTable) -> None: """validate that kind=category does not change the categories""" @@ -2888,10 +2914,10 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): def set_attr(self) -> None: """set the data for this column""" - setattr(self.attrs, self.kind_attr, self.values) - setattr(self.attrs, self.meta_attr, self.meta) + _set_attr_if_changed(self.attrs, self.kind_attr, self.values) + _set_attr_if_changed(self.attrs, self.meta_attr, self.meta) assert self.dtype is not None - setattr(self.attrs, self.dtype_attr, self.dtype) + _set_attr_if_changed(self.attrs, self.dtype_attr, self.dtype) class DataIndexableCol(DataCol): @@ -3633,7 +3659,7 @@ def read( dfs.append(df) if len(dfs) > 0: - out = concat(dfs, axis=1).copy() + out = concat(dfs, axis=1) return out.reindex(columns=items) return DataFrame(columns=axes[0], index=axes[1])