Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ I/O
- Fixed :func:`read_json` with ``lines=True`` and ``nrows=0`` to return an empty DataFrame (:issue:`64025`)
- :meth:`DataFrame.to_hdf` now raises a clear :class:`NotImplementedError` when writing a column or :class:`Index` of an unsupported extension dtype (such as :class:`IntervalDtype`, :class:`SparseDtype`, or the nullable integer/float/boolean dtypes), instead of a low-level ``AttributeError`` or PyTables ``TypeError`` (:issue:`26144`, :issue:`38305`, :issue:`42070`)
- Fixed ``MemoryError`` in :meth:`HDFStore.select` when iterating large tables with ``chunksize`` and no ``where`` filter (:issue:`15937`)
- Fixed bug in :func:`read_hdf` where the literal string ``"nan"`` in a string :class:`Index` was incorrectly converted to ``NaN`` on read, even when a custom ``nan_rep`` was supplied (:issue:`9604`)
- Fixed bug in :meth:`DataFrame.to_hdf` with ``format="table"`` where a :class:`TimedeltaIndex` was reconstructed as a :class:`PeriodIndex` (when ``freq`` was set) or an integer :class:`Index` (otherwise) on read-back (:issue:`21466`)
- Fixed bug in :meth:`HDFStore.select` where passing ``where`` as a list of conditions referencing caller-scope variables failed on Python 3.12+, because :pep:`709` inlines list comprehensions and so removes their separate stack frame (:issue:`64881`)
- Storing a :class:`DataFrame` or :class:`Series` with a :class:`MultiIndex` level named ``'index'`` via :meth:`HDFStore.put` or :meth:`HDFStore.append` with ``format='table'`` now raises a clear ``ValueError`` instead of an opaque reshape error (:issue:`6208`)
Expand Down
18 changes: 12 additions & 6 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -2901,8 +2901,13 @@ def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):

# convert nans / decode
if kind == "string":
# Old files may have been written without nan_rep persisted; the
# writer (write_data) defaulted None to "nan", so do the same here.
converted = _unconvert_string_array(
converted, nan_rep=nan_rep, encoding=encoding, errors=errors
converted,
nan_rep=nan_rep if nan_rep is not None else "nan",
encoding=encoding,
errors=errors,
)

return self.values, converted
Expand Down Expand Up @@ -5543,7 +5548,10 @@ def _unconvert_string_array(
Parameters
----------
data : np.ndarray[fixed-length-string]
nan_rep : the storage repr of NaN
nan_rep : the storage repr of NaN, or None to skip substitution.
Pass None when the writer did not encode NaN as a sentinel string
(e.g. for string indices); otherwise legitimate occurrences of the
sentinel value would be incorrectly replaced with NaN on read.
encoding : str
errors : str
Handler for encoding errors.
Expand All @@ -5569,10 +5577,8 @@ def _unconvert_string_array(
else:
data = data.astype(dtype, copy=False).astype(object, copy=False)

if nan_rep is None:
nan_rep = "nan"

libwriters.string_array_replace_from_nan_rep(data, nan_rep)
if nan_rep is not None:
libwriters.string_array_replace_from_nan_rep(data, nan_rep)
return data.reshape(shape)


Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/io/pytables/test_round_trip.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,57 @@ def test_long_strings(temp_hdfstore):
tm.assert_frame_equal(df, result)


def test_string_nan_in_index_fixed(temp_h5_path):
    # GH#9604: a string Index holding the literal label "nan" must come back
    # unchanged. The reader used to run a NaN-sentinel substitution over the
    # index even though the writer never encoded NaN that way for indices.
    labels = ["nan", "kai", "institute", "of", "technology"]
    expected = Series(range(len(labels)), index=labels)

    # Write with to_hdf and read the same key straight back.
    expected.to_hdf(temp_h5_path, key="s", mode="w")
    roundtripped = read_hdf(temp_h5_path, "s")

    tm.assert_series_equal(roundtripped, expected)


def test_string_nan_in_index_table(temp_hdfstore):
    # GH#9604: table-format counterpart of the literal-"nan" index case.
    # The data is appended with a custom nan_rep, so this also checks that
    # nan_rep is respected for the index (the index read used to ignore it).
    labels = ["nan", "kai", "institute", "of", "technology"]
    expected = Series(range(len(labels)), index=labels)

    store = temp_hdfstore
    store.append("s", expected, nan_rep="_nan_")
    selected = store.select("s")

    tm.assert_series_equal(selected, expected)


def test_string_nan_in_dataframe_index(temp_h5_path):
    # GH#9604: a named DataFrame index containing the literal string "nan"
    # must round-trip through both the default to_hdf call and format="table".
    named_index = Index(["nan", "kai", "institute", "of"], name="ix")
    expected = DataFrame({"a": [1, 2, 3, 4]}, index=named_index)

    # First write creates the file (mode="w") without an explicit format.
    expected.to_hdf(temp_h5_path, key="fixed", mode="w")
    from_fixed = read_hdf(temp_h5_path, "fixed")
    tm.assert_frame_equal(from_fixed, expected)

    # Second write adds a table-format key to the same file (mode="a").
    expected.to_hdf(temp_h5_path, key="table", mode="a", format="table")
    from_table = read_hdf(temp_h5_path, "table")
    tm.assert_frame_equal(from_table, expected)


def test_string_column_literal_nan_and_real_nan(temp_hdfstore):
    # GH#9604: companion to the index tests. For a data column the nan_rep
    # substitution must still apply symmetrically, so with a custom nan_rep
    # both the literal string "nan" and a genuine NaN survive the round trip.
    column_values = ["x", "nan", np.nan, "y"]
    row_labels = ["i1", "i2", "i3", "i4"]
    expected = DataFrame({"a": column_values}, index=row_labels)

    store = temp_hdfstore
    store.append("df", expected, nan_rep="_NA_")
    selected = store.select("df")

    tm.assert_frame_equal(selected, expected)


def test_api(temp_h5_path):
# GH4584
# API issue when to_hdf doesn't accept append AND format args
Expand Down
Loading