TST: add tests exposing some issues with datetimes with arrow? #486

Draft: wants to merge 2 commits into base: main
58 changes: 53 additions & 5 deletions pyogrio/tests/test_geopandas_io.py
@@ -1,11 +1,13 @@
 import contextlib
 import locale
+import time
 import warnings
 from datetime import datetime
 from io import BytesIO
 from zipfile import ZipFile
 
 import numpy as np
+import pytz
 
 from pyogrio import (
     __gdal_version__,
@@ -298,7 +300,7 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow):
     write_dataframe(df, fpath, use_arrow=use_arrow)
     df_read = read_dataframe(fpath, use_arrow=use_arrow)
     if use_arrow:
-        # with Arrow, the datetimes are always read as UTC
+        # with Arrow, the datetimes are always read as UTC for .gpkg
         expected = expected.dt.tz_convert("UTC")
     assert_series_equal(df_read.datetime_col, expected)
 
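A side note on the assertion above: `tz_convert("UTC")` only changes how the timestamps are labelled, not the instants they refer to, so the comparison still checks that the same moments came back from the file. A minimal pandas-only sketch (the sample value is made up, not taken from the test file):

import pandas as pd

# A series carrying a fixed -05:00 offset, similar in shape to the test data.
s = pd.Series(pd.to_datetime(["2020-01-01 09:00:00.123-05:00"]))

# Converting to UTC relabels the same instant; only the displayed wall time changes.
utc = s.dt.tz_convert("UTC")
print(s.iloc[0])    # 2020-01-01 09:00:00.123000-05:00
print(utc.iloc[0])  # 2020-01-01 14:00:00.123000+00:00
assert (s == utc).all()  # equal instants, different timezone labels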
@@ -328,11 +330,38 @@ def test_write_datetime_mixed_offset(tmp_path, use_arrow):
     assert_series_equal(result["dates"], utc_col)
 
 
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
 @pytest.mark.filterwarnings(
     "ignore: Non-conformant content for record 1 in column dates"
 )
 @pytest.mark.requires_arrow_write_api
-def test_read_write_datetime_tz_with_nulls(tmp_path, use_arrow):
+def test_read_write_datetime_no_tz(tmp_path, ext, use_arrow):
+    dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"]
+    if PANDAS_GE_20:
+        dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
+    else:
+        dates = pd.to_datetime(dates_raw)
+    df = gp.GeoDataFrame(
+        {"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]},
+        crs="EPSG:4326",
+    )
+    fpath = tmp_path / f"test{ext}"
+    write_dataframe(df, fpath, use_arrow=use_arrow)
+    result = read_dataframe(fpath, use_arrow=use_arrow)
+    if use_arrow and ext == ".gpkg":
+        # for GPKG with Arrow, the datetime is written as a naive datetime with the
+        # correct times, but when read the naive time is assumed to be UTC, which
+        # changes the effective time, so this seems wrong.
+        df["dates"] = df["dates"].dt.tz_localize(time.timezone).dt.tz_convert("UTC")
+    assert_geodataframe_equal(df, result)
+
+
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
+@pytest.mark.filterwarnings(
+    "ignore: Non-conformant content for record 1 in column dates"
+)
+@pytest.mark.requires_arrow_write_api
+def test_read_write_datetime_tz_with_nulls(tmp_path, ext, use_arrow):
     dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", pd.NaT]
     if PANDAS_GE_20:
         dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
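The comment in test_read_write_datetime_no_tz describes the suspect behaviour this PR wants to expose: naive datetimes written to GPKG come back tz-aware in UTC when read through Arrow. A standalone sketch of that round trip, assuming a pyogrio/GDAL combination where writing with use_arrow=True is supported (the temp path and printed dtypes are illustrative, not copied from test output):

import tempfile
from pathlib import Path

import geopandas as gp
import pandas as pd
from shapely.geometry import Point

from pyogrio import read_dataframe, write_dataframe

# Naive (timezone-less) datetimes, mirroring the data in the new test.
df = gp.GeoDataFrame(
    {
        "dates": pd.to_datetime(["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"]),
        "geometry": [Point(1, 1), Point(1, 1)],
    },
    crs="EPSG:4326",
)

fpath = Path(tempfile.mkdtemp()) / "test.gpkg"
write_dataframe(df, fpath, use_arrow=True)
result = read_dataframe(fpath, use_arrow=True)

# Per the test's comment: the values are written correctly, but on read the naive
# times are assumed to be UTC, so the column comes back tz-aware (UTC) and the
# effective instants no longer match the input.
print(df["dates"].dtype)      # datetime64[ns]
print(result["dates"].dtype)  # a UTC-aware dtype, per the behaviour described above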
@@ -342,12 +371,31 @@ def test_read_write_datetime_tz_with_nulls(tmp_path, use_arrow):
         {"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]},
         crs="EPSG:4326",
     )
-    fpath = tmp_path / "test.gpkg"
+    fpath = tmp_path / f"test{ext}"
     write_dataframe(df, fpath, use_arrow=use_arrow)
     result = read_dataframe(fpath, use_arrow=use_arrow)
     if use_arrow:
-        # with Arrow, the datetimes are always read as UTC
-        df["dates"] = df["dates"].dt.tz_convert("UTC")
+        if ext == ".fgb":
+            # when FlatGeobuf is read with Arrow, the datetimes come back as naive
+            # values without any timezone information.
+            # REMARK: for .fgb, the timezone is just dropped when reading or writing!!!
+            # -> 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T09:00:00.123
+            df["dates"] = df["dates"].dt.tz_localize(tz=None)
+        elif ext in (".geojson", ".geojsonl"):
+            # when GeoJSON is read with Arrow, datetimes with an equal timezone get a
+            # column type with the appropriate minutes offset.
+            # REMARK: for .geojson, the data is written fine, but reading goes wrong:
+            # 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T04:00:00.123-05:00
+            df["dates"] = (
+                df["dates"]
+                .dt.tz_localize(tz=None)
+                .dt.tz_localize(tz="UTC")
+                .dt.tz_convert(pytz.FixedOffset(-300))
+            )
+        elif ext == ".gpkg":
+            # when GPKG is read with Arrow, datetimes with a timezone are converted
+            # to UTC.
+            df["dates"] = df["dates"].dt.tz_convert("UTC")
     assert_geodataframe_equal(df, result)
 
 
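The chained expression in the GeoJSON branch above reconstructs the values the test expects the reader to (wrongly) return: the -05:00 offset is dropped, the same wall-clock time is re-labelled as UTC, and the result is then displayed with a fixed -05:00 offset again, which shifts the wall time by five hours. A small pandas/pytz sketch of that arithmetic, with no file I/O involved (the sample value is taken from the test's REMARK):

import pandas as pd
import pytz

ts = pd.Series(pd.to_datetime(["2020-01-01T09:00:00.123-05:00"]))
print(ts.iloc[0])  # 2020-01-01 09:00:00.123000-05:00

wrong = (
    ts.dt.tz_localize(None)                  # drop the offset, keep the wall time
    .dt.tz_localize("UTC")                   # re-label that wall time as UTC
    .dt.tz_convert(pytz.FixedOffset(-300))   # FixedOffset takes minutes: -300 == UTC-05:00
)
print(wrong.iloc[0])  # 2020-01-01 04:00:00.123000-05:00

# The instant has shifted by five hours, matching the test's remark that
# 2020-01-01T09:00:00.123-05:00 becomes 2020-01-01T04:00:00.123-05:00.
assert wrong.iloc[0] != ts.iloc[0]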