Skip to content

Commit

Permalink
Merge branch 'main' into issue#57111_10
Browse files Browse the repository at this point in the history
  • Loading branch information
jordan-d-murphy authored Feb 13, 2024
2 parents fe8d088 + 1d7aedc commit 84657e5
Show file tree
Hide file tree
Showing 96 changed files with 1,416 additions and 2,384 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/code-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ jobs:
if: ${{ steps.build.outcome == 'success' && always() }}

- name: Typing + pylint
uses: pre-commit/[email protected].0
uses: pre-commit/[email protected].1
with:
extra_args: --verbose --hook-stage manual --all-files
if: ${{ steps.build.outcome == 'success' && always() }}
Expand Down
6 changes: 0 additions & 6 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,12 +159,6 @@ def setup(self):

def time_items(self):
    """Iterate over every column of ``self.df`` starting from a cold cache.

    Clearing ``_item_cache`` first (when the attribute exists) ensures the
    benchmark monitors the no-copying behaviour of a fresh iteration.
    """
    if hasattr(self.df, "_item_cache"):
        self.df._item_cache.clear()
    for _label, _column in self.df.items():
        pass

def time_items_cached(self):
    """Iterate over every column of ``self.df`` with a warm item cache."""
    for _ in self.df.items():
        pass

Expand Down
38 changes: 11 additions & 27 deletions asv_bench/benchmarks/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,20 @@


class ToNumeric:
params = ["ignore", "coerce"]
param_names = ["errors"]

def setup(self, errors):
def setup(self):
N = 10000
self.float = Series(np.random.randn(N))
self.numstr = self.float.astype("str")
self.str = Series(Index([f"i-{i}" for i in range(N)], dtype=object))

def time_from_float(self, errors):
to_numeric(self.float, errors=errors)
def time_from_float(self):
to_numeric(self.float, errors="coerce")

def time_from_numeric_str(self, errors):
to_numeric(self.numstr, errors=errors)
def time_from_numeric_str(self):
to_numeric(self.numstr, errors="coerce")

def time_from_str(self, errors):
to_numeric(self.str, errors=errors)
def time_from_str(self):
to_numeric(self.str, errors="coerce")


class ToNumericDowncast:
Expand Down Expand Up @@ -187,7 +184,7 @@ def time_iso8601_tz_spaceformat(self):

def time_iso8601_infer_zero_tz_fromat(self):
# GH 41047
to_datetime(self.strings_zero_tz, infer_datetime_format=True)
to_datetime(self.strings_zero_tz)


class ToDatetimeNONISO8601:
Expand Down Expand Up @@ -271,16 +268,6 @@ def time_dup_string_tzoffset_dates(self, cache):
to_datetime(self.dup_string_with_tz, cache=cache)


# GH 43901
class ToDatetimeInferDatetimeFormat:
    """Benchmark ``to_datetime`` format inference on uniformly formatted strings."""

    def setup(self):
        # 100k hourly timestamps rendered with one consistent format so the
        # format-inference fast path is exercised on every element.
        stamps = date_range(start="1/1/2000", periods=100000, freq="h")
        self.strings = stamps.strftime("%Y-%m-%d %H:%M:%S").tolist()

    def time_infer_datetime_format(self):
        to_datetime(self.strings, infer_datetime_format=True)


class ToTimedelta:
def setup(self):
self.ints = np.random.randint(0, 60, size=10000)
Expand All @@ -301,16 +288,13 @@ def time_convert_string_seconds(self):


class ToTimedeltaErrors:
params = ["coerce", "ignore"]
param_names = ["errors"]

def setup(self, errors):
def setup(self):
ints = np.random.randint(0, 60, size=10000)
self.arr = [f"{i} days" for i in ints]
self.arr[-1] = "apple"

def time_convert(self, errors):
to_timedelta(self.arr, errors=errors)
def time_convert(self):
to_timedelta(self.arr, errors="coerce")


from .pandas_vb_common import setup # noqa: F401 isort:skip
65 changes: 63 additions & 2 deletions ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,74 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
pandas.Interval\
pandas.Grouper\
pandas.core.groupby.DataFrameGroupBy.nth\
pandas.core.groupby.DataFrameGroupBy.rolling\
pandas.core.groupby.SeriesGroupBy.nth\
pandas.core.groupby.SeriesGroupBy.rolling\
pandas.core.groupby.DataFrameGroupBy.plot\
pandas.core.groupby.SeriesGroupBy.plot # There should be no backslash in the final line, please keep this comment in the last ignored function
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Partially validate docstrings (SA05)' ; echo $MSG
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=SA05 --ignore_functions \
pandas.DataFrame.agg\
pandas.DataFrame.aggregate\
pandas.DataFrame.boxplot\
pandas.PeriodIndex.asfreq\
pandas.arrays.ArrowStringArray\
pandas.arrays.StringArray\
pandas.core.groupby.DataFrameGroupBy.first\
pandas.core.groupby.DataFrameGroupBy.last\
pandas.core.groupby.SeriesGroupBy.first\
pandas.core.groupby.SeriesGroupBy.last\
pandas.core.resample.Resampler.first\
pandas.core.resample.Resampler.last\
pandas.core.window.ewm.ExponentialMovingWindow.corr\
pandas.core.window.ewm.ExponentialMovingWindow.cov\
pandas.core.window.ewm.ExponentialMovingWindow.mean\
pandas.core.window.ewm.ExponentialMovingWindow.std\
pandas.core.window.ewm.ExponentialMovingWindow.sum\
pandas.core.window.ewm.ExponentialMovingWindow.var\
pandas.core.window.expanding.Expanding.aggregate\
pandas.core.window.expanding.Expanding.apply\
pandas.core.window.expanding.Expanding.corr\
pandas.core.window.expanding.Expanding.count\
pandas.core.window.expanding.Expanding.cov\
pandas.core.window.expanding.Expanding.kurt\
pandas.core.window.expanding.Expanding.max\
pandas.core.window.expanding.Expanding.mean\
pandas.core.window.expanding.Expanding.median\
pandas.core.window.expanding.Expanding.min\
pandas.core.window.expanding.Expanding.quantile\
pandas.core.window.expanding.Expanding.rank\
pandas.core.window.expanding.Expanding.sem\
pandas.core.window.expanding.Expanding.skew\
pandas.core.window.expanding.Expanding.std\
pandas.core.window.expanding.Expanding.sum\
pandas.core.window.expanding.Expanding.var\
pandas.core.window.rolling.Rolling.aggregate\
pandas.core.window.rolling.Rolling.apply\
pandas.core.window.rolling.Rolling.corr\
pandas.core.window.rolling.Rolling.count\
pandas.core.window.rolling.Rolling.cov\
pandas.core.window.rolling.Rolling.kurt\
pandas.core.window.rolling.Rolling.max\
pandas.core.window.rolling.Rolling.mean\
pandas.core.window.rolling.Rolling.median\
pandas.core.window.rolling.Rolling.min\
pandas.core.window.rolling.Rolling.quantile\
pandas.core.window.rolling.Rolling.rank\
pandas.core.window.rolling.Rolling.sem\
pandas.core.window.rolling.Rolling.skew\
pandas.core.window.rolling.Rolling.std\
pandas.core.window.rolling.Rolling.sum\
pandas.core.window.rolling.Rolling.var\
pandas.core.window.rolling.Window.mean\
pandas.core.window.rolling.Window.std\
pandas.core.window.rolling.Window.sum\
pandas.core.window.rolling.Window.var\
pandas.plotting.bootstrap_plot\
pandas.plotting.boxplot\
pandas.plotting.radviz # There should be no backslash in the final line, please keep this comment in the last ignored function
RET=$(($RET + $?)) ; echo $MSG "DONE"

fi

### DOCUMENTATION NOTEBOOKS ###
Expand Down
3 changes: 0 additions & 3 deletions doc/redirects.csv
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,6 @@ generated/pandas.core.resample.Resampler.backfill,../reference/api/pandas.core.r
generated/pandas.core.resample.Resampler.bfill,../reference/api/pandas.core.resample.Resampler.bfill
generated/pandas.core.resample.Resampler.count,../reference/api/pandas.core.resample.Resampler.count
generated/pandas.core.resample.Resampler.ffill,../reference/api/pandas.core.resample.Resampler.ffill
generated/pandas.core.resample.Resampler.fillna,../reference/api/pandas.core.resample.Resampler.fillna
generated/pandas.core.resample.Resampler.first,../reference/api/pandas.core.resample.Resampler.first
generated/pandas.core.resample.Resampler.get_group,../reference/api/pandas.core.resample.Resampler.get_group
generated/pandas.core.resample.Resampler.groups,../reference/api/pandas.core.resample.Resampler.groups
Expand Down Expand Up @@ -481,7 +480,6 @@ generated/pandas.DataFrame.style,../reference/api/pandas.DataFrame.style
generated/pandas.DataFrame.sub,../reference/api/pandas.DataFrame.sub
generated/pandas.DataFrame.subtract,../reference/api/pandas.DataFrame.subtract
generated/pandas.DataFrame.sum,../reference/api/pandas.DataFrame.sum
generated/pandas.DataFrame.swapaxes,../reference/api/pandas.DataFrame.swapaxes
generated/pandas.DataFrame.swaplevel,../reference/api/pandas.DataFrame.swaplevel
generated/pandas.DataFrame.tail,../reference/api/pandas.DataFrame.tail
generated/pandas.DataFrame.take,../reference/api/pandas.DataFrame.take
Expand Down Expand Up @@ -1206,7 +1204,6 @@ generated/pandas.Series.str.zfill,../reference/api/pandas.Series.str.zfill
generated/pandas.Series.sub,../reference/api/pandas.Series.sub
generated/pandas.Series.subtract,../reference/api/pandas.Series.subtract
generated/pandas.Series.sum,../reference/api/pandas.Series.sum
generated/pandas.Series.swapaxes,../reference/api/pandas.Series.swapaxes
generated/pandas.Series.swaplevel,../reference/api/pandas.Series.swaplevel
generated/pandas.Series.tail,../reference/api/pandas.Series.tail
generated/pandas.Series.take,../reference/api/pandas.Series.take
Expand Down
1 change: 0 additions & 1 deletion doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,6 @@ Reshaping, sorting, transposing
DataFrame.swaplevel
DataFrame.stack
DataFrame.unstack
DataFrame.swapaxes
DataFrame.melt
DataFrame.explode
DataFrame.squeeze
Expand Down
1 change: 0 additions & 1 deletion doc/source/reference/resampling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ Upsampling
Resampler.ffill
Resampler.bfill
Resampler.nearest
Resampler.fillna
Resampler.asfreq
Resampler.interpolate

Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@ Fixed regressions
- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
- Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`)
- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
- Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`)
- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`)
- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
- Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`)
- Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`)
- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`)
- Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`)
- Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`)
- Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for an index with duplicates (:issue:`57151`)
- Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`)
- Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`)
Expand Down
10 changes: 9 additions & 1 deletion doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Other enhancements
^^^^^^^^^^^^^^^^^^
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
- Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
-

.. ---------------------------------------------------------------------------
Expand Down Expand Up @@ -106,29 +107,36 @@ Removal of prior version deprecations/changes
- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
- All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`)
- Changed the default value of ``observed`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` to ``True`` (:issue:`51811`)
- Removed :meth:`DataFrame.applymap`, :meth:`Styler.applymap` and :meth:`Styler.applymap_index` (:issue:`52364`)
- Removed ``DataFrame.applymap``, ``Styler.applymap`` and ``Styler.applymap_index`` (:issue:`52364`)
- Removed ``DataFrame.bool`` and ``Series.bool`` (:issue:`51756`)
- Removed ``DataFrame.first`` and ``DataFrame.last`` (:issue:`53710`)
- Removed ``DataFrame.swapaxes`` and ``Series.swapaxes`` (:issue:`51946`)
- Removed ``DataFrameGroupBy.grouper`` and ``SeriesGroupBy.grouper`` (:issue:`56521`)
- Removed ``DataFrameGroupBy.fillna`` and ``SeriesGroupBy.fillna`` (:issue:`55719`)
- Removed ``Index.format``, use :meth:`Index.astype` with ``str`` or :meth:`Index.map` with a ``formatter`` function instead (:issue:`55439`)
- Removed ``Resample.fillna`` (:issue:`55719`)
- Removed ``Series.__int__`` and ``Series.__float__``. Call ``int(Series.iloc[0])`` or ``float(Series.iloc[0])`` instead. (:issue:`51131`)
- Removed ``Series.ravel`` (:issue:`56053`)
- Removed ``Series.view`` (:issue:`56054`)
- Removed ``StataReader.close`` (:issue:`49228`)
- Removed ``axis`` argument from :meth:`DataFrame.groupby`, :meth:`Series.groupby`, :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.resample`, and :meth:`Series.resample` (:issue:`51203`)
- Removed ``axis`` argument from all groupby operations (:issue:`50405`)
- Removed ``convert_dtype`` from :meth:`Series.apply` (:issue:`52257`)
- Removed ``pandas.api.types.is_interval`` and ``pandas.api.types.is_period``, use ``isinstance(obj, pd.Interval)`` and ``isinstance(obj, pd.Period)`` instead (:issue:`55264`)
- Removed ``pandas.io.sql.execute`` (:issue:`50185`)
- Removed ``pandas.value_counts``, use :meth:`Series.value_counts` instead (:issue:`53493`)
- Removed ``read_gbq`` and ``DataFrame.to_gbq``. Use ``pandas_gbq.read_gbq`` and ``pandas_gbq.to_gbq`` instead https://pandas-gbq.readthedocs.io/en/latest/api.html (:issue:`55525`)
- Removed ``use_nullable_dtypes`` from :func:`read_parquet` (:issue:`51853`)
- Removed deprecated argument ``obj`` in :meth:`.DataFrameGroupBy.get_group` and :meth:`.SeriesGroupBy.get_group` (:issue:`53545`)
- Removed deprecated behavior of :meth:`Series.agg` using :meth:`Series.apply` (:issue:`53325`)
- Removed support for ``errors="ignore"`` in :func:`to_datetime`, :func:`to_timedelta` and :func:`to_numeric` (:issue:`55734`)
- Removed the ``ArrayManager`` (:issue:`55043`)
- Removed the ``fastpath`` argument from the :class:`Series` constructor (:issue:`55466`)
- Removed the ``is_boolean``, ``is_integer``, ``is_floating``, ``holds_integer``, ``is_numeric``, ``is_categorical``, ``is_object``, and ``is_interval`` attributes of :class:`Index` (:issue:`50042`)
- Removed unused arguments ``*args`` and ``**kwargs`` in :class:`Resampler` methods (:issue:`50977`)
- Unrecognized timezones when parsing strings to datetimes now raises a ``ValueError`` (:issue:`51477`)


.. ---------------------------------------------------------------------------
.. _whatsnew_300.performance:

Expand Down
44 changes: 36 additions & 8 deletions pandas/_libs/src/vendored/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -447,8 +447,15 @@ static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
npyarr->curdim--;
npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim];
npyarr->stridedim -= npyarr->inc;
npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim);
npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim);

if (!PyArray_Check(npyarr->array)) {
PyErr_SetString(PyExc_TypeError,
"NpyArrayPassThru_iterEnd received a non-array object");
return;
}
const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array;
npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim);
npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim);
npyarr->dataptr += npyarr->stride;

NpyArr_freeItemValue(obj, tc);
Expand All @@ -467,12 +474,19 @@ static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {

NpyArr_freeItemValue(obj, tc);

if (PyArray_ISDATETIME(npyarr->array)) {
if (!PyArray_Check(npyarr->array)) {
PyErr_SetString(PyExc_TypeError,
"NpyArr_iterNextItem received a non-array object");
return 0;
}
PyArrayObject *arrayobj = (PyArrayObject *)npyarr->array;

if (PyArray_ISDATETIME(arrayobj)) {
GET_TC(tc)->itemValue = obj;
Py_INCREF(obj);
((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array);
((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(arrayobj);
// Also write the resolution (unit) of the ndarray
PyArray_Descr *dtype = PyArray_DESCR(npyarr->array);
PyArray_Descr *dtype = PyArray_DESCR(arrayobj);
((PyObjectEncoder *)tc->encoder)->valueUnit =
get_datetime_metadata_from_dtype(dtype).base;
((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr;
Expand Down Expand Up @@ -505,8 +519,15 @@ static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) {

npyarr->curdim++;
npyarr->stridedim += npyarr->inc;
npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim);
npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim);
if (!PyArray_Check(npyarr->array)) {
PyErr_SetString(PyExc_TypeError,
"NpyArr_iterNext received a non-array object");
return 0;
}
const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array;

npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim);
npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim);
npyarr->index[npyarr->stridedim] = 0;

((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
Expand Down Expand Up @@ -1610,7 +1631,14 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
if (!values) {
goto INVALID;
}
pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0);

if (!PyArray_Check(pc->newObj)) {
PyErr_SetString(PyExc_TypeError,
"Object_beginTypeContext received a non-array object");
goto INVALID;
}
const PyArrayObject *arrayobj = (const PyArrayObject *)pc->newObj;
pc->columnLabelsLen = PyArray_DIM(arrayobj, 0);
pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
pc->columnLabelsLen);
if (!pc->columnLabels) {
Expand Down
Loading

0 comments on commit 84657e5

Please sign in to comment.