Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add use_nullable_dtypes for read_html #50286

Merged
merged 8 commits into from
Dec 27, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,15 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following

* :func:`read_csv`
* :func:`read_excel`
* :func:`read_html`
* :func:`read_sql`

Additionally, a new global configuration, ``mode.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
to select the nullable dtypes implementation.

* :func:`read_csv` (with ``engine="pyarrow"``)
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
* :func:`read_excel`
* :func:`read_html`
* :func:`read_parquet`
* :func:`read_orc`

Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1384,6 +1384,7 @@ STR_NA_VALUES = {
"nan",
"-nan",
"",
"None",
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
}
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))

Expand Down
9 changes: 9 additions & 0 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -1043,6 +1043,7 @@ def read_html(
keep_default_na: bool = True,
displayed_only: bool = True,
extract_links: Literal[None, "header", "footer", "body", "all"] = None,
use_nullable_dtypes: bool = False,
) -> list[DataFrame]:
r"""
Read HTML tables into a ``list`` of ``DataFrame`` objects.
Expand Down Expand Up @@ -1143,6 +1144,13 @@ def read_html(

.. versionadded:: 1.5.0

use_nullable_dtypes : bool, default False
Whether to use nullable dtypes as default when reading data. If
set to True, nullable dtypes are used for all dtypes that have a nullable
implementation, even if no nulls are present.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add the additional paragraph of mode.dtype_backend being available that other docstrings have? (Should start with The nullable dtype implementation)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thx, added


.. versionadded:: 2.0

Returns
-------
dfs
Expand Down Expand Up @@ -1218,4 +1226,5 @@ def read_html(
keep_default_na=keep_default_na,
displayed_only=displayed_only,
extract_links=extract_links,
use_nullable_dtypes=use_nullable_dtypes,
)
66 changes: 66 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
from pandas.compat import is_platform_windows
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
NA,
DataFrame,
MultiIndex,
Series,
Expand All @@ -27,6 +29,10 @@
to_datetime,
)
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
StringArray,
)

from pandas.io.common import file_path_to_url
import pandas.io.html
Expand Down Expand Up @@ -132,6 +138,66 @@ def test_to_html_compat(self):
res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0]
tm.assert_frame_equal(res, df)

@pytest.mark.parametrize("nullable_backend", ["pandas", "pyarrow"])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
@pytest.mark.parametrize("nullable_backend", ["pandas", "pyarrow"])
@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])

@pytest.mark.parametrize("storage", ["python", "pyarrow"])
def test_use_nullable_dtypes(self, storage, nullable_backend):
# GH#50286
df = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
"b": Series([1, 2, 3], dtype="Int64"),
"c": Series([1.5, np.nan, 2.5], dtype="Float64"),
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": [True, False, None],
"f": [True, False, True],
"g": ["a", "b", "c"],
"h": ["a", "b", None],
}
)

string_array: StringArray | ArrowStringArray
string_array_na: StringArray | ArrowStringArray
if storage == "python":
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))

else:
pa = pytest.importorskip("pyarrow")
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))

out = df.to_html(index=False)
with pd.option_context("mode.string_storage", storage):
with pd.option_context("mode.nullable_backend", nullable_backend):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
with pd.option_context("mode.nullable_backend", nullable_backend):
with pd.option_context("mode.dtype_backend", nullable_backend):

result = self.read_html(out, use_nullable_dtypes=True)[0]

expected = DataFrame(
{
"a": Series([1, np.nan, 3], dtype="Int64"),
"b": Series([1, 2, 3], dtype="Int64"),
"c": Series([1.5, np.nan, 2.5], dtype="Float64"),
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
"e": Series([True, False, NA], dtype="boolean"),
"f": Series([True, False, True], dtype="boolean"),
"g": string_array,
"h": string_array_na,
}
)

if nullable_backend == "pyarrow":
import pyarrow as pa

from pandas.arrays import ArrowExtensionArray

expected = DataFrame(
{
col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
for col in expected.columns
}
)

tm.assert_frame_equal(result, expected)

@pytest.mark.network
@tm.network(
url=(
Expand Down