From 224486925e5fd278963164064cc7b6d559f586e1 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Fri, 25 Oct 2024 00:35:38 -0400 Subject: [PATCH 1/2] Add fix for #59242 --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/internals/construction.py | 12 +++++++++++- pandas/tests/io/test_sql.py | 14 ++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 01c2ed3821d7a..2880ecec81f66 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -130,7 +130,7 @@ MultiIndex I/O ^^^ -- +- Bug in :func:`read_sql` causing an unintended exception when byte data was being converted to string when using the pyarrow dtype_backend (:issue:`59242`) - Period diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 959e572b2b35b..1900ed282e876 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -970,7 +970,17 @@ def convert(arr): if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): new_dtype = StringDtype() arr_cls = new_dtype.construct_array_type() - arr = arr_cls._from_sequence(arr, dtype=new_dtype) + try: + # Addressing (#59242) + # Byte data that could not be decoded into + # a string would throw a UnicodeDecodeError exception + + # Try and greedily convert to string + # Will fail if the object is bytes + arr = arr_cls._from_sequence(arr, dtype=new_dtype) + except UnicodeDecodeError: + pass + elif dtype_backend != "numpy" and isinstance(arr, np.ndarray): if arr.dtype.kind in "iufb": arr = pd_array(arr, copy=False) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index c28a33069d23f..69ad44d1a5e73 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4352,3 +4352,17 @@ def test_xsqlite_if_exists(sqlite_buildin): (5, "E"), ] drop_table(table_name, sqlite_buildin) + + +def test_bytes_column(sqlite_buildin): + """ + Regression test for (#59242) + Bytes being returned in a column that could not be converted + to a string would raise a UnicodeDecodeError + when using dtype_backend='pyarrow' + """ + query = """ + select cast(x'0123456789abcdef0123456789abcdef' as blob) a + """ + df = pd.read_sql(query, sqlite_buildin, dtype_backend="pyarrow") + assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" From bd00fc545e25a611d97edecb9aac8c0324d17e90 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Fri, 25 Oct 2024 18:09:53 -0400 Subject: [PATCH 2/2] add skip import --- pandas/tests/io/test_sql.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 69ad44d1a5e73..73f9ff42287fc 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4355,6 +4355,7 @@ def test_xsqlite_if_exists(sqlite_buildin): def test_bytes_column(sqlite_buildin): + pytest.importorskip("pyarrow") """ Regression test for (#59242) Bytes being returned in a column that could not be converted