clib.conversion: Deal with np.object dtype in vectors_to_arrays and deprecate the array_to_datetime function #3507

Status: Draft · 19 commits to merge into base branch main from remove/array-to-datetime

Commits (19):
3661e54  Refactor vectors_to_arrays and deprecate the array_to_datetime function (seisman, Oct 11, 2024)
20b9215  No need to use pd.api.types.is_string_dtype anymore (seisman, Oct 13, 2024)
83673cf  Add tests for vectors_to_arrays (seisman, Oct 14, 2024)
56a0841  Rename test_clib_conversion.py to test_clib_vectors_to_arrays.py (seisman, Oct 14, 2024)
6338cde  Add one line of code back (seisman, Oct 14, 2024)
1864556  Explicitly checking array.dtype.type (seisman, Oct 14, 2024)
54160bf  Correctly skip the tests if pyarrow is not installed (seisman, Oct 14, 2024)
f4e1a5f  Explicitly specify how to convert pandas/pyarrow string dtype to nump… (seisman, Oct 14, 2024)
ed3be20  Ensure the resulting numpy dtypes are supported by GMT (seisman, Oct 14, 2024)
8ab6c6c  Merge branch 'main' into remove/array-to-datetime (seisman, Oct 17, 2024)
e26afbf  Revert any changes that enhances the conversion process (seisman, Oct 17, 2024)
be3c93e  Rename array_to_datetime to _array_to_datetime (seisman, Oct 17, 2024)
3d40687  Merge branch 'main' into remove/array-to-datetime (seisman, Oct 19, 2024)
8f99a97  Some numpy dtypes like np.float16 can't be recognized by GMT (seisman, Oct 19, 2024)
5ecf445  Merge branch 'main' into remove/array-to-datetime (seisman, Oct 22, 2024)
cd75c5c  Merge branch 'main' into remove/array-to-datetime (seisman, Oct 23, 2024)
91e10a7  Merge branch 'main' into remove/array-to-datetime (seisman, Oct 24, 2024)
47214e5  Merge branch 'main' into remove/array-to-datetime (seisman, Oct 24, 2024)
d8777e5  Check three variants for string dtypes (seisman, Oct 24, 2024)
24 changes: 18 additions & 6 deletions pygmt/clib/conversion.py
@@ -190,6 +190,13 @@ def vectors_to_arrays(vectors: Sequence[Any]) -> list[np.ndarray]:
else:
vec_dtype = str(getattr(vector, "dtype", ""))
array = np.ascontiguousarray(vector, dtype=dtypes.get(vec_dtype))
+ # Convert np.object_ to np.datetime64 or np.str_.
+ # If the conversion fails, the array can't be recognized.
+ if array.dtype.type == np.object_:
+     try:
+         array = np.ascontiguousarray(array, dtype=np.datetime64)
+     except ValueError:
+         array = np.ascontiguousarray(array, dtype=np.str_)
arrays.append(array)
return arrays
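As a rough usage sketch of the new fallback above (assuming this branch of the PR; the example itself is not part of the diff): an object-dtype vector of dates converts to datetime64, while an object-dtype vector of plain strings falls back to str_.

import datetime

import numpy as np

from pygmt.clib.conversion import vectors_to_arrays

# Hypothetical inputs: object-dtype vectors of dates and of strings.
dates = np.array([datetime.date(2020, 1, 1), datetime.date(2021, 12, 31)], dtype=object)
words = np.array(["abc", "defghi"], dtype=object)

arrays = vectors_to_arrays([dates, words])
print(arrays[0].dtype)  # expected: a datetime64 dtype (the np.datetime64 conversion succeeds)
print(arrays[1].dtype)  # expected: <U6 (datetime64 parsing fails, so np.str_ is used)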

@@ -273,12 +280,17 @@ def strings_to_ctypes_array(strings: Sequence[str] | np.ndarray) -> ctp.Array:
return (ctp.c_char_p * len(strings))(*[s.encode() for s in strings])


-def array_to_datetime(array: Sequence[Any] | np.ndarray) -> np.ndarray:
+def _array_to_datetime(array: Sequence[Any] | np.ndarray) -> np.ndarray:
"""
Convert a 1-D datetime array from various types into numpy.datetime64.

If the input array is not in legal datetime formats, raise a ValueError exception.

.. deprecated:: 0.14.0
Comment from seisman (Member, Author):

After this PR, array_to_datetime is no longer used, but I still want to keep this function so that we know what kinds of datetime formats np.asarray(array, dtype=np.datetime64) can support.


The function is no longer used in the PyGMT project, but we keep this function
to document the supported datetime types.

Parameters
----------
array
@@ -309,20 +321,20 @@ def array_to_datetime(array: Sequence[Any] | np.ndarray) -> np.ndarray:
... ["2010-06-01", "2011-06-01T12", "2012-01-01T12:34:56"],
... dtype="datetime64[ns]",
... )
->>> array_to_datetime(x)
+>>> _array_to_datetime(x)
array(['2010-06-01T00:00:00.000000000', '2011-06-01T12:00:00.000000000',
'2012-01-01T12:34:56.000000000'], dtype='datetime64[ns]')

>>> # pandas.DateTimeIndex array
>>> import pandas as pd
>>> x = pd.date_range("2013", freq="YS", periods=3)
->>> array_to_datetime(x)
+>>> _array_to_datetime(x)
array(['2013-01-01T00:00:00.000000000', '2014-01-01T00:00:00.000000000',
'2015-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

>>> # Python's built-in date and datetime
>>> x = [datetime.date(2018, 1, 1), datetime.datetime(2019, 1, 1)]
->>> array_to_datetime(x)
+>>> _array_to_datetime(x)
array(['2018-01-01T00:00:00.000000', '2019-01-01T00:00:00.000000'],
dtype='datetime64[us]')

@@ -333,7 +345,7 @@ def array_to_datetime(array: Sequence[Any] | np.ndarray) -> np.ndarray:
... "2018-03-01",
... "2018-04-01T01:02:03",
... ]
->>> array_to_datetime(x)
+>>> _array_to_datetime(x)
array(['2018-01-01T00:00:00', '2018-02-01T00:00:00',
'2018-03-01T00:00:00', '2018-04-01T01:02:03'],
dtype='datetime64[s]')
@@ -344,7 +356,7 @@ def array_to_datetime(array: Sequence[Any] | np.ndarray) -> np.ndarray:
... np.datetime64("2018-01-01"),
... datetime.datetime(2018, 1, 1),
... ]
->>> array_to_datetime(x)
+>>> _array_to_datetime(x)
array(['2018-01-01T00:00:00.000000', '2018-01-01T00:00:00.000000',
'2018-01-01T00:00:00.000000'], dtype='datetime64[us]')
"""
22 changes: 6 additions & 16 deletions pygmt/clib/session.py
@@ -17,7 +17,6 @@
import pandas as pd
import xarray as xr
from pygmt.clib.conversion import (
-    array_to_datetime,
    dataarray_to_matrix,
    sequence_to_ctypes_array,
    strings_to_ctypes_array,
@@ -848,22 +847,13 @@ def _check_dtype_and_dim(self, array, ndim):
"""
# Check that the array has the given number of dimensions
if array.ndim != ndim:
-raise GMTInvalidInput(
-    f"Expected a numpy {ndim}-D array, got {array.ndim}-D."
-)
+msg = f"Expected a numpy {ndim}-D array, got {array.ndim}-D."
+raise GMTInvalidInput(msg)

# Check that the array has a valid/known data type
if array.dtype.type not in DTYPES:
-try:
-    if array.dtype.type is np.object_:
-        # Try to convert unknown object type to np.datetime64
-        array = array_to_datetime(array)
-    else:
-        raise ValueError
-except ValueError as e:
-    raise GMTInvalidInput(
-        f"Unsupported numpy data type '{array.dtype.type}'."
-    ) from e
+msg = f"Unsupported numpy data type '{array.dtype.type}'."
+raise GMTInvalidInput(msg)
return self[DTYPES[array.dtype.type]]

def put_vector(self, dataset: ctp.c_void_p, column: int, vector: np.ndarray):
@@ -910,7 +900,7 @@ def put_vector(self, dataset: ctp.c_void_p, column: int, vector: np.ndarray):
gmt_type = self._check_dtype_and_dim(vector, ndim=1)
if gmt_type in {self["GMT_TEXT"], self["GMT_DATETIME"]}:
if gmt_type == self["GMT_DATETIME"]:
-vector = np.datetime_as_string(array_to_datetime(vector))
+vector = np.datetime_as_string(vector)
vector_pointer = strings_to_ctypes_array(vector)
else:
vector_pointer = vector.ctypes.data_as(ctp.c_void_p)
@@ -1400,7 +1390,7 @@ def virtualfile_from_vectors(
# 2 columns contains coordinates like longitude, latitude, or datetime string
# types.
for col, array in enumerate(arrays[2:]):
-if pd.api.types.is_string_dtype(array.dtype):
+if array.dtype.type == np.str_:
Comment from seisman (Member, Author):

pd.api.types.is_string_dtype was added in PR #684. Before PR #684, we used np.issubdtype(array.dtype, np.str_), which was added in #520. Is there any specific reason to use np.issubdtype(array.dtype, np.str_) rather than array.dtype.type == np.str_?
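For reference, a minimal side-by-side of the two checks on a plain numpy string array (an illustrative sketch, not taken from the thread):

import numpy as np

arr = np.array(["abc", "defghi"])  # fixed-width unicode, dtype '<U6'
print(np.issubdtype(arr.dtype, np.str_))  # True
print(arr.dtype.type == np.str_)          # True; both checks agree for numpy unicode arrays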

Comment from a Member:

I think we'll need to check if this can handle pandas.StringDtype and pyarrow.StringArray (xref #2933).

Comment from seisman (Member, Author):

Both can be converted to the numpy string dtype by the vectors_to_arrays method, so in virtualfile_from_vectors we only need to check np.str_:

In [1]: import numpy as np

In [2]: import pandas as pd

In [3]: import pyarrow as pa

In [4]: x = pd.Series(["abc", "defghi"], dtype="string")

In [5]: np.asarray(x)
Out[5]: array(['abc', 'defghi'], dtype=object)

In [6]: np.asarray(x, dtype=str)
Out[6]: array(['abc', 'defghi'], dtype='<U6')

In [7]: y = pa.array(["abc", "defghi"])

In [8]: np.asarray(y)
Out[8]: array(['abc', 'defghi'], dtype=object)

In [9]: np.asarray(y, dtype=str)
Out[9]: array(['abc', 'defghi'], dtype='<U6')

Comment from seisman (Member, Author):

The main idea of this PR is to let vectors_to_arrays do the conversion from any dtype (including pd.StringDtype and pa.StringArray) into numpy dtypes, so that virtualfile_from_vectors only needs to care about how to map numpy dtypes to GMT data types.

For any special dtype that we know how to convert to a numpy dtype, we can maintain a mapping dictionary, just like what you did to support pyarrow's date32[day] and date64[ms] in #2845:

dtypes = {
    "date32[day][pyarrow]": np.datetime64,
    "date64[ms][pyarrow]": np.datetime64,
}

Comment from seisman (Member, Author), Oct 14, 2024:

In 83673cf, I've moved most of the doctests into a separate test file, pygmt/tests/test_clib_vectors_to_arrays.py. A test, test_vectors_to_arrays_pandas_string, is added to check that the function handles pd.StringDtype correctly.

Comment from seisman (Member, Author):

> For any special dtype that we know how to convert to a numpy dtype, we can maintain a mapping dictionary, just like what you did to support pyarrow's date32[day] and date64[ms] in #2845:
>
> dtypes = {
>     "date32[day][pyarrow]": np.datetime64,
>     "date64[ms][pyarrow]": np.datetime64,
> }

Based on the tests below, I think we should add the entry "string": np.str_ to the dictionary:

In [1]: import pandas as pd

In [2]: x = pd.Series(["abc", "12345"])

In [3]: x.dtype
Out[3]: dtype('O')

In [4]: str(x.dtype)
Out[4]: 'object'

In [5]: x = pd.Series(["abc", "12345"], dtype="string")

In [6]: x.dtype
Out[6]: string[python]

In [7]: str(x.dtype)
Out[7]: 'string'

In [8]: x = pd.Series(["abc", "12345"], dtype="string[pyarrow]")

In [9]: x.dtype
Out[9]: string[pyarrow]

In [10]: str(x.dtype)
Out[10]: 'string'

In [11]: import pyarrow as pa

In [12]: x = pa.array(["abc", "defghi"])

In [13]: x.type
Out[13]: DataType(string)

In [14]: str(x.type)
Out[14]: 'string'
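A sketch of what the mapping might look like with that entry added (hypothetical; the keys follow the stringified dtype/type values shown in the sessions above, and this is not the code merged in this PR):

import numpy as np

# Hypothetical dtypes mapping in vectors_to_arrays: keys are the stringified
# dtype of a vector, values are the target numpy dtypes to convert to.
dtypes = {
    "date32[day][pyarrow]": np.datetime64,
    "date64[ms][pyarrow]": np.datetime64,
    "string": np.str_,  # pandas "string[python]" and "string[pyarrow]" both stringify to "string"
}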

Comment from seisman (Member, Author):

In PR #2774 and #2774, we only checked whether PyGMT supports pandas with the pyarrow backend, but didn't check whether the original pyarrow arrays work. For example, for a pyarrow date32 array, we need to check array.type rather than array.dtype:

In [1]: import datetime

In [2]: import pyarrow as pa

In [3]: x = pa.array([datetime.date(2020, 1, 1), datetime.date(2021, 12, 31)])

In [4]: str(x.type)
Out[4]: 'date32[day]'

Comment from a Member:

> For example, for a pyarrow date32 array, we need to check array.type

Yes, raw pyarrow arrays of date32/date64 type are not supported yet. That's why it's marked 🚧 in #2800 (I was planning to modify array_to_datetime to handle it).
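A minimal sketch of how the lookup could also consult .type so that raw pyarrow arrays are caught (hypothetical; not code from this PR):

import datetime

import pyarrow as pa

x = pa.array([datetime.date(2020, 1, 1), datetime.date(2021, 12, 31)])

# Raw pyarrow arrays expose .type instead of .dtype, so a lookup keyed only on
# getattr(vector, "dtype", "") misses them; falling back to .type would catch it.
vec_dtype = str(getattr(x, "dtype", "") or getattr(x, "type", ""))
print(vec_dtype)  # 'date32[day]'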

columns = col + 2
break

22 changes: 22 additions & 0 deletions pygmt/tests/test_clib_vectors_to_arrays.py
@@ -10,6 +10,7 @@
import pandas as pd
import pytest
from pygmt.clib.conversion import vectors_to_arrays
from pygmt.helpers.testing import skip_if_no

_HAS_PYARROW = bool(importlib.util.find_spec("pyarrow"))

@@ -80,6 +81,27 @@ def test_vectors_to_arrays_pandas_nan():
_check_arrays(arrays)


@pytest.mark.parametrize(
    "dtype",
    [
        "string[python]",
        pytest.param("string[pyarrow]", marks=skip_if_no(package="pyarrow")),
        pytest.param("string[pyarrow_numpy]", marks=skip_if_no(package="pyarrow")),
    ],
)
def test_vectors_to_arrays_pandas_string(dtype):
    """
    Test the vectors_to_arrays function with pandas strings.
    """
    vectors = [
        pd.Series(["abc", "defg"], dtype=dtype),
        pd.Series(["hijklmn", "123456"], dtype=dtype),
    ]
    arrays = vectors_to_arrays(vectors)
    assert all(i.dtype.type == np.str_ for i in arrays)
    _check_arrays(arrays)


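A possible companion test for raw pyarrow string arrays, which the np.object_ fallback above should also cover (a sketch only, reusing the _check_arrays helper and _HAS_PYARROW flag already in this file; this test is not part of the diff):

@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed.")
def test_vectors_to_arrays_pyarrow_string():
    """
    Test the vectors_to_arrays function with raw pyarrow string arrays.
    """
    import pyarrow as pa

    vectors = [pa.array(["abc", "defg"]), pa.array(["hijklmn", "123456"])]
    arrays = vectors_to_arrays(vectors)
    assert all(i.dtype.type == np.str_ for i in arrays)
    _check_arrays(arrays)
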
@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed.")
def test_vectors_to_arrays_pyarrow_datetime():
"""