diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index a43f1f7ee866d..12e89fa19c6ea 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -342,6 +342,7 @@ I/O - Fixed :func:`read_json` with ``lines=True`` and ``nrows=0`` to return an empty DataFrame (:issue:`64025`) - :meth:`DataFrame.to_hdf` now raises a clear :class:`NotImplementedError` when writing a column or :class:`Index` of an unsupported extension dtype (such as :class:`IntervalDtype`, :class:`SparseDtype`, or the nullable integer/float/boolean dtypes), instead of a low-level ``AttributeError`` or PyTables ``TypeError`` (:issue:`26144`, :issue:`38305`, :issue:`42070`) - Fixed ``MemoryError`` in :meth:`HDFStore.select` when iterating large tables with ``chunksize`` and no ``where`` filter (:issue:`15937`) +- Fixed bug in :func:`read_hdf` where the literal string ``"nan"`` in a string :class:`Index` was incorrectly converted to ``NaN`` on read, even when a custom ``nan_rep`` was supplied (:issue:`9604`) - Fixed bug in :meth:`DataFrame.to_hdf` with ``format="table"`` where a :class:`TimedeltaIndex` was reconstructed as a :class:`PeriodIndex` (when ``freq`` was set) or an integer :class:`Index` (otherwise) on read-back (:issue:`21466`) - Fixed bug in :meth:`HDFStore.select` where passing ``where`` as a list of conditions referencing caller-scope variables failed on Python 3.12+ due to :pep:`709` inlining list comprehension stack frames (:issue:`64881`) - Storing a :class:`DataFrame` or :class:`Series` with a :class:`MultiIndex` level named ``'index'`` via :meth:`HDFStore.put` or :meth:`HDFStore.append` with ``format='table'`` now raises a clear ``ValueError`` instead of an opaque reshape error (:issue:`6208`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 06e9b219a0eba..b38642385dfaf 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2901,8 +2901,13 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): # convert nans / decode if kind == "string": + # Old files may have been written without nan_rep persisted; the + # writer (write_data) defaulted None to "nan", so do the same here. converted = _unconvert_string_array( - converted, nan_rep=nan_rep, encoding=encoding, errors=errors + converted, + nan_rep=nan_rep if nan_rep is not None else "nan", + encoding=encoding, + errors=errors, ) return self.values, converted @@ -5543,7 +5548,10 @@ def _unconvert_string_array( Parameters ---------- data : np.ndarray[fixed-length-string] - nan_rep : the storage repr of NaN + nan_rep : the storage repr of NaN, or None to skip substitution. + Pass None when the writer did not encode NaN as a sentinel string + (e.g. for string indices); otherwise legitimate occurrences of the + sentinel value would be incorrectly replaced with NaN on read. encoding : str errors : str Handler for encoding errors. @@ -5569,10 +5577,8 @@ def _unconvert_string_array( else: data = data.astype(dtype, copy=False).astype(object, copy=False) - if nan_rep is None: - nan_rep = "nan" - - libwriters.string_array_replace_from_nan_rep(data, nan_rep) + if nan_rep is not None: + libwriters.string_array_replace_from_nan_rep(data, nan_rep) return data.reshape(shape) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 7aa57d3d38168..628a229523742 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -61,6 +61,57 @@ def test_long_strings(temp_hdfstore): tm.assert_frame_equal(df, result) +def test_string_nan_in_index_fixed(temp_h5_path): + # GH#9604 — literal "nan" string in a string Index was being converted to + # NaN on read because the unconverter applied a NaN-sentinel substitution + # that the writer never performed for indices. + words = ["nan", "kai", "institute", "of", "technology"] + ser = Series(range(len(words)), index=words) + + ser.to_hdf(temp_h5_path, key="s", mode="w") + result = read_hdf(temp_h5_path, "s") + + tm.assert_series_equal(result, ser) + + +def test_string_nan_in_index_table(temp_hdfstore): + # GH#9604 — same bug, table format. Also verifies that nan_rep is honored + # for the index (it was previously silently ignored on the index read). + words = ["nan", "kai", "institute", "of", "technology"] + ser = Series(range(len(words)), index=words) + + temp_hdfstore.append("s", ser, nan_rep="_nan_") + result = temp_hdfstore.select("s") + + tm.assert_series_equal(result, ser) + + +def test_string_nan_in_dataframe_index(temp_h5_path): + # GH#9604 — DataFrame index with literal "nan" strings, both formats. + df = DataFrame( + {"a": [1, 2, 3, 4]}, + index=Index(["nan", "kai", "institute", "of"], name="ix"), + ) + + df.to_hdf(temp_h5_path, key="fixed", mode="w") + tm.assert_frame_equal(read_hdf(temp_h5_path, "fixed"), df) + + df.to_hdf(temp_h5_path, key="table", mode="a", format="table") + tm.assert_frame_equal(read_hdf(temp_h5_path, "table"), df) + + +def test_string_column_literal_nan_and_real_nan(temp_hdfstore): + # GH#9604 — companion to the index tests: ensure the symmetric nan_rep + # substitution still works on a data column, i.e. a custom nan_rep lets + # both a literal "nan" string and an actual NaN round-trip correctly. + df = DataFrame({"a": ["x", "nan", np.nan, "y"]}, index=["i1", "i2", "i3", "i4"]) + + temp_hdfstore.append("df", df, nan_rep="_NA_") + result = temp_hdfstore.select("df") + + tm.assert_frame_equal(result, df) + + def test_api(temp_h5_path): # GH4584 # API issue when to_hdf doesn't accept append AND format args