Merge remote-tracking branch 'upstream/main' into Solution-for-issue-#60044-by-ZKaoChi
pandas-dev · Oct 16, 2024 · 5910018 · 5910018
2 parents aeb73af + 2a10e04
commit 5910018
Show file tree

Hide file tree

Showing 58 changed files with 410 additions and 258 deletions.
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
@@ -156,7 +156,7 @@ jobs:
         run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.20.0
+        uses: pypa/cibuildwheel@v2.21.0
         with:
          package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
         env:

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -74,7 +74,7 @@ repos:
     hooks:
     -   id: isort
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v3.16.0
+    rev: v3.17.0
     hooks:
     -   id: pyupgrade
         args: [--py310-plus]
@@ -112,7 +112,7 @@ repos:
         types: [python]
         stages: [manual]
         additional_dependencies: &pyright_dependencies
-        - pyright@1.1.352
+        - pyright@1.1.383
     -   id: pyright
         # note: assumes python env is setup and activated
         name: pyright reportGeneralTypeIssues

diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst
@@ -133,9 +133,9 @@ API changes
     to be inserted if ``True``, default is ``False`` (same as prior to 0.12) (:issue:`3679`)
   - Implement ``__nonzero__`` for ``NDFrame`` objects (:issue:`3691`, :issue:`3696`)
 
-  - IO api
+  - IO API
 
-    - added top-level function ``read_excel`` to replace the following,
+    - Added top-level function ``read_excel`` to replace the following,
       The original API is deprecated and will be removed in a future version
 
       .. code-block:: python
@@ -153,7 +153,7 @@ API changes
 
          pd.read_excel("path_to_file.xls", "Sheet1", index_col=None, na_values=["NA"])
 
-    - added top-level function ``read_sql`` that is equivalent to the following
+    - Added top-level function ``read_sql`` that is equivalent to the following
 
       .. code-block:: python
 
@@ -482,11 +482,11 @@ Bug fixes
 
   - ``HDFStore``
 
-    - will retain index attributes (freq,tz,name) on recreation (:issue:`3499`)
-    - will warn with a ``AttributeConflictWarning`` if you are attempting to append
+    - Will retain index attributes (freq,tz,name) on recreation (:issue:`3499`)
+    - Will warn with a ``AttributeConflictWarning`` if you are attempting to append
       an index with a different frequency than the existing, or attempting
       to append an index with a different name than the existing
-    - support datelike columns with a timezone as data_columns (:issue:`2852`)
+    - Support datelike columns with a timezone as data_columns (:issue:`2852`)
 
   - Non-unique index support clarified (:issue:`3468`).
 

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -16,12 +16,12 @@ Enhancements
 
 .. _whatsnew_300.enhancements.enhancement1:
 
-enhancement1
+Enhancement1
 ^^^^^^^^^^^^
 
 .. _whatsnew_300.enhancements.enhancement2:
 
-enhancement2
+Enhancement2
 ^^^^^^^^^^^^
 
 .. _whatsnew_300.enhancements.other:

diff --git a/environment.yml b/environment.yml
@@ -76,10 +76,10 @@ dependencies:
   - cxx-compiler
 
   # code checks
-  - flake8=6.1.0  # run in subprocess over docstring examples
-  - mypy=1.9.0  # pre-commit uses locally installed mypy
+  - flake8=7.1.0  # run in subprocess over docstring examples
+  - mypy=1.11.2  # pre-commit uses locally installed mypy
   - tokenize-rt  # scripts/check_for_inconsistent_pandas_namespace.py
-  - pre-commit>=3.6.0
+  - pre-commit>=4.0.1
 
   # documentation
   - gitpython  # obtain contributors from git for whatsnew

diff --git a/pandas/_config/config.py b/pandas/_config/config.py
@@ -411,7 +411,7 @@ def __dir__(self) -> list[str]:
 
 
 @contextmanager
-def option_context(*args) -> Generator[None, None, None]:
+def option_context(*args) -> Generator[None]:
     """
     Context manager to temporarily set options in a ``with`` statement.
 
@@ -718,7 +718,7 @@ def _build_option_description(k: str) -> str:
 
 
 @contextmanager
-def config_prefix(prefix: str) -> Generator[None, None, None]:
+def config_prefix(prefix: str) -> Generator[None]:
     """
     contextmanager for multiple invocations of API with a common prefix
 

diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py
@@ -25,7 +25,7 @@
 @contextmanager
 def set_locale(
     new_locale: str | tuple[str, str], lc_var: int = locale.LC_ALL
-) -> Generator[str | tuple[str, str], None, None]:
+) -> Generator[str | tuple[str, str]]:
     """
     Context manager for temporarily setting a locale.
 

diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py
@@ -35,7 +35,7 @@ def assert_produces_warning(
     raise_on_extra_warnings: bool = True,
     match: str | tuple[str | None, ...] | None = None,
     must_find_all_warnings: bool = True,
-) -> Generator[list[warnings.WarningMessage], None, None]:
+) -> Generator[list[warnings.WarningMessage]]:
     """
     Context manager for running code expected to either raise a specific warning,
     multiple specific warnings, or not raise any warnings. Verifies that the code

diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py
@@ -29,7 +29,7 @@
 @contextmanager
 def decompress_file(
     path: FilePath | BaseBuffer, compression: CompressionOptions
-) -> Generator[IO[bytes], None, None]:
+) -> Generator[IO[bytes]]:
     """
     Open a compressed file and return a file object.
 
@@ -50,7 +50,7 @@ def decompress_file(
 
 
 @contextmanager
-def set_timezone(tz: str) -> Generator[None, None, None]:
+def set_timezone(tz: str) -> Generator[None]:
     """
     Context manager for temporarily setting a timezone.
 
@@ -92,7 +92,7 @@ def setTZ(tz) -> None:
 
 
 @contextmanager
-def ensure_clean(filename=None) -> Generator[Any, None, None]:
+def ensure_clean(filename=None) -> Generator[Any]:
     """
     Gets a temporary path and agrees to remove on close.
 
@@ -124,7 +124,7 @@ def ensure_clean(filename=None) -> Generator[Any, None, None]:
 
 
 @contextmanager
-def with_csv_dialect(name: str, **kwargs) -> Generator[None, None, None]:
+def with_csv_dialect(name: str, **kwargs) -> Generator[None]:
     """
     Context manager to temporarily register a CSV dialect for parsing CSV.
 

diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py
@@ -131,7 +131,7 @@ def loads(
 
 
 @contextlib.contextmanager
-def patch_pickle() -> Generator[None, None, None]:
+def patch_pickle() -> Generator[None]:
     """
     Temporarily patch pickle to use our unpickler.
     """

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -246,12 +246,8 @@ def transform(self) -> DataFrame | Series:
             and not obj.empty
         ):
             raise ValueError("Transform function failed")
-        # error: Argument 1 to "__get__" of "AxisProperty" has incompatible type
-        # "Union[Series, DataFrame, GroupBy[Any], SeriesGroupBy,
-        # DataFrameGroupBy, BaseWindow, Resampler]"; expected "Union[DataFrame,
-        # Series]"
         if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
-            obj.index  # type: ignore[arg-type]
+            obj.index
         ):
             raise ValueError("Function did not transform")
 
@@ -803,7 +799,7 @@ def result_columns(self) -> Index:
 
     @property
     @abc.abstractmethod
-    def series_generator(self) -> Generator[Series, None, None]:
+    def series_generator(self) -> Generator[Series]:
         pass
 
     @staticmethod
@@ -1128,7 +1124,7 @@ class FrameRowApply(FrameApply):
     axis: AxisInt = 0
 
     @property
-    def series_generator(self) -> Generator[Series, None, None]:
+    def series_generator(self) -> Generator[Series]:
         return (self.obj._ixs(i, axis=1) for i in range(len(self.columns)))
 
     @staticmethod
@@ -1235,7 +1231,7 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame:
         return result.T
 
     @property
-    def series_generator(self) -> Generator[Series, None, None]:
+    def series_generator(self) -> Generator[Series]:
         values = self.values
         values = ensure_wrapped_if_datetimelike(values)
         assert len(values) > 0

diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py
@@ -403,12 +403,12 @@ def _reconstruct(result):
             # for np.<ufunc>(..) calls
             # kwargs cannot necessarily be handled block-by-block, so only
             # take this path if there are no kwargs
-            mgr = inputs[0]._mgr
+            mgr = inputs[0]._mgr  # pyright: ignore[reportGeneralTypeIssues]
             result = mgr.apply(getattr(ufunc, method))
         else:
             # otherwise specific ufunc methods (eg np.<ufunc>.accumulate(..))
             # Those can have an axis keyword and thus can't be called block-by-block
-            result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs)
+            result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs)  # pyright: ignore[reportGeneralTypeIssues]
             # e.g. np.negative (only one reached), with "where" and "out" in kwargs
 
     result = reconstruct(result)

diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
@@ -10,15 +10,14 @@
 
 import numpy as np
 
+from pandas._libs import lib
 from pandas.compat import (
     pa_version_under10p1,
     pa_version_under11p0,
     pa_version_under13p0,
     pa_version_under17p0,
 )
 
-from pandas.core.dtypes.missing import isna
-
 if not pa_version_under10p1:
     import pyarrow as pa
     import pyarrow.compute as pc
@@ -38,7 +37,7 @@ class ArrowStringArrayMixin:
     def __init__(self, *args, **kwargs) -> None:
         raise NotImplementedError
 
-    def _convert_bool_result(self, result):
+    def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
         # Convert a bool-dtype result to the appropriate result type
         raise NotImplementedError
 
@@ -212,7 +211,9 @@ def _str_removesuffix(self, suffix: str):
         result = pc.if_else(ends_with, removed, self._pa_array)
         return type(self)(result)
 
-    def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
+    def _str_startswith(
+        self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
+    ):
         if isinstance(pat, str):
             result = pc.starts_with(self._pa_array, pattern=pat)
         else:
@@ -225,11 +226,11 @@ def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
 
                 for p in pat[1:]:
                     result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
-        if not isna(na):  # pyright: ignore [reportGeneralTypeIssues]
-            result = result.fill_null(na)
-        return self._convert_bool_result(result)
+        return self._convert_bool_result(result, na=na, method_name="startswith")
 
-    def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
+    def _str_endswith(
+        self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
+    ):
         if isinstance(pat, str):
             result = pc.ends_with(self._pa_array, pattern=pat)
         else:
@@ -242,9 +243,7 @@ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
 
                 for p in pat[1:]:
                     result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
-        if not isna(na):  # pyright: ignore [reportGeneralTypeIssues]
-            result = result.fill_null(na)
-        return self._convert_bool_result(result)
+        return self._convert_bool_result(result, na=na, method_name="endswith")
 
     def _str_isalnum(self):
         result = pc.utf8_is_alnum(self._pa_array)
@@ -283,7 +282,12 @@ def _str_isupper(self):
         return self._convert_bool_result(result)
 
     def _str_contains(
-        self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
+        self,
+        pat,
+        case: bool = True,
+        flags: int = 0,
+        na: Scalar | lib.NoDefault = lib.no_default,
+        regex: bool = True,
     ):
         if flags:
             raise NotImplementedError(f"contains not implemented with {flags=}")
@@ -293,19 +297,25 @@ def _str_contains(
         else:
             pa_contains = pc.match_substring
         result = pa_contains(self._pa_array, pat, ignore_case=not case)
-        if not isna(na):  # pyright: ignore [reportGeneralTypeIssues]
-            result = result.fill_null(na)
-        return self._convert_bool_result(result)
+        return self._convert_bool_result(result, na=na, method_name="contains")
 
     def _str_match(
-        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
+        self,
+        pat: str,
+        case: bool = True,
+        flags: int = 0,
+        na: Scalar | lib.NoDefault = lib.no_default,
     ):
         if not pat.startswith("^"):
             pat = f"^{pat}"
         return self._str_contains(pat, case, flags, na, regex=True)
 
     def _str_fullmatch(
-        self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
+        self,
+        pat,
+        case: bool = True,
+        flags: int = 0,
+        na: Scalar | lib.NoDefault = lib.no_default,
     ):
         if not pat.endswith("$") or pat.endswith("\\$"):
             pat = f"{pat}$"

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -2318,7 +2318,9 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
             for chunk in self._pa_array.iterchunks()
         ]
 
-    def _convert_bool_result(self, result):
+    def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
+        if na is not lib.no_default and not isna(na):  # pyright: ignore [reportGeneralTypeIssues]
+            result = result.fill_null(na)
         return type(self)(result)
 
     def _convert_int_result(self, result):
@@ -2426,7 +2428,7 @@ def _str_rindex(self, sub: str, start: int = 0, end: int | None = None) -> Self:
         result = self._apply_elementwise(predicate)
         return type(self)(pa.chunked_array(result))
 
-    def _str_normalize(self, form: str) -> Self:
+    def _str_normalize(self, form: Literal["NFC", "NFD", "NFKC", "NFKD"]) -> Self:
         predicate = lambda val: unicodedata.normalize(form, val)
         result = self._apply_elementwise(predicate)
         return type(self)(pa.chunked_array(result))

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -369,7 +369,7 @@ def _coerce_to_array(
             assert dtype == "boolean"
         return coerce_to_array(value, copy=copy)
 
-    def _logical_method(self, other, op):
+    def _logical_method(self, other, op):  # type: ignore[override]
         assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
         other_is_scalar = lib.is_scalar(other)
         mask = None

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2679,16 +2679,28 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
     # ------------------------------------------------------------------------
     # String methods interface
     def _str_map(
-        self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True
+        self, f, na_value=lib.no_default, dtype=np.dtype("object"), convert: bool = True
     ):
         # Optimization to apply the callable `f` to the categories once
         # and rebuild the result by `take`ing from the result with the codes.
         # Returns the same type as the object-dtype implementation though.
-        from pandas.core.arrays import NumpyExtensionArray
-
         categories = self.categories
         codes = self.codes
-        result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype)
+        if categories.dtype == "string":
+            result = categories.array._str_map(f, na_value, dtype)  # type: ignore[attr-defined]
+            if (
+                categories.dtype.na_value is np.nan  # type: ignore[union-attr]
+                and is_bool_dtype(dtype)
+                and (na_value is lib.no_default or isna(na_value))
+            ):
+                # NaN propagates as False for functions with boolean return type
+                na_value = False
+        else:
+            from pandas.core.arrays import NumpyExtensionArray
+
+            result = NumpyExtensionArray(categories.to_numpy())._str_map(
+                f, na_value, dtype
+            )
         return take_nd(result, codes, fill_value=na_value)
 
     def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):