From 56e4b526f239a6894044e0b9e20223b11ee0ce5f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Oct 2024 21:20:44 +0200 Subject: [PATCH] ENH: geography constructor from geoarrow --- CMakeLists.txt | 20 ++++++-- ci/environment-dev.yml | 1 + ci/environment.yml | 1 + src/arrow_abi.h | 57 ++++++++++++++++++++++ src/geoarrow.cpp | 107 +++++++++++++++++++++++++++++++++++++++++ src/spherely.cpp | 12 +++++ tests/test_geoarrow.py | 105 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 298 insertions(+), 5 deletions(-) create mode 100644 src/arrow_abi.h create mode 100644 src/geoarrow.cpp create mode 100644 tests/test_geoarrow.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d59553..8cbc7fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,15 +71,25 @@ endif() # Build -add_library(spherely MODULE +set(CPP_SOURCES src/geography.cpp src/accessors-geog.cpp src/predicates.cpp - src/spherely.cpp - ) + src/spherely.cpp) + +if(${s2geography_VERSION} VERSION_GREATER_EQUAL "0.2.0") + set(CPP_SOURCES ${CPP_SOURCES} src/geoarrow.cpp) +endif() + +add_library(spherely MODULE ${CPP_SOURCES}) -target_compile_definitions(spherely - PRIVATE VERSION_INFO=${PROJECT_VERSION}) +target_compile_definitions( + spherely + PRIVATE + VERSION_INFO=${PROJECT_VERSION} + S2GEOGRAPHY_VERSION=${s2geography_VERSION} + S2GEOGRAPHY_VERSION_MAJOR=${s2geography_VERSION_MAJOR} + S2GEOGRAPHY_VERSION_MINOR=${s2geography_VERSION_MINOR}) target_link_libraries(spherely PRIVATE pybind11::module pybind11::lto pybind11::windows_extras diff --git a/ci/environment-dev.yml b/ci/environment-dev.yml index f00ca15..a22d803 100644 --- a/ci/environment-dev.yml +++ b/ci/environment-dev.yml @@ -13,3 +13,4 @@ dependencies: - ninja - pytest - pip + - geoarrow-pyarrow diff --git a/ci/environment.yml b/ci/environment.yml index 6dca01c..28c8655 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -14,3 +14,4 @@ dependencies: - ninja - pytest - pip + - geoarrow-pyarrow diff --git a/src/arrow_abi.h b/src/arrow_abi.h new file mode 100644 index 0000000..b8b62b8 --- /dev/null +++ b/src/arrow_abi.h @@ -0,0 +1,57 @@ +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Extra guard for versions of Arrow without the canonical guard +#ifndef ARROW_FLAG_DICTIONARY_ORDERED + +#ifndef ARROW_C_DATA_INTERFACE +#define ARROW_C_DATA_INTERFACE + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DATA_INTERFACE +#endif // ARROW_FLAG_DICTIONARY_ORDERED + +#ifdef __cplusplus +} +#endif diff --git a/src/geoarrow.cpp b/src/geoarrow.cpp new file mode 100644 index 0000000..4a427ac --- /dev/null +++ b/src/geoarrow.cpp @@ -0,0 +1,107 @@ +// include this before s2geography to avoid ArrowArray compile issues +#include + +#include "arrow_abi.h" +#include "geography.hpp" +#include "pybind11.hpp" + +namespace py = pybind11; +namespace s2geog = s2geography; +using namespace spherely; + +py::array_t from_geoarrow(py::object input, + bool oriented, + bool planar, + py::object geometry_encoding) { + py::tuple capsules = input.attr("__arrow_c_array__")(); + py::capsule schema_capsule = capsules[0]; + py::capsule array_capsule = capsules[1]; + + const ArrowSchema* schema = static_cast(schema_capsule); + const ArrowArray* array = static_cast(array_capsule); + + s2geog::geoarrow::Reader reader; + std::vector> s2geog_vec; + + s2geog::geoarrow::ImportOptions options; + options.set_oriented(oriented); + if (planar) { + // TODO replace with constant + auto tol = S1Angle::Radians(100.0 / (6371.01 * 1000)); + options.set_tessellate_tolerance(tol); + } + if (geometry_encoding.is(py::none())) { + reader.Init(schema, options); + } else if (geometry_encoding.equal(py::str("WKT"))) { + reader.Init(s2geog::geoarrow::Reader::InputType::kWKT, options); + } else if (geometry_encoding.equal(py::str("WKB"))) { + reader.Init(s2geog::geoarrow::Reader::InputType::kWKB, options); + } else { + throw std::invalid_argument("'geometry_encoding' should be one of None, 'WKT' or 'WKB'"); + } + reader.ReadGeography(array, 0, array->length, &s2geog_vec); + + // Convert resulting vector to array of python objects + auto result = py::array_t(array->length); + py::buffer_info rbuf = result.request(); + py::object* rptr = static_cast(rbuf.ptr); + + py::ssize_t i = 0; + for (auto& s2geog_ptr : s2geog_vec) { + auto geog_ptr = std::make_unique(std::move(s2geog_ptr)); + // return PyObjectGeography::from_geog(std::move(geog_ptr)); + rptr[i] = py::cast(std::move(geog_ptr)); + i++; + } + return result; +} + +void init_geoarrow(py::module& m) { + m.def("from_geoarrow", + &from_geoarrow, + py::arg("input"), + py::pos_only(), + py::kw_only(), + py::arg("oriented") = false, + py::arg("planar") = false, + py::arg("geometry_encoding") = py::none(), + R"pbdoc( + Create an array of geographies from an Arrow array object with a GeoArrow + extension type. + + See https://geoarrow.org/ for details on the GeoArrow specification. + + This functions accepts any Arrow array object implementing + the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_array__`` + method). + + .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html + + Parameters + ---------- + input : pyarrow.Array, Arrow array + Any array object implementing the Arrow PyCapsule Protocol + (i.e. has a ``__arrow_c_array__`` method). The type of the array + should be one of the geoarrow geometry types. + oriented : bool, default False + Set to True if polygon ring directions are known to be correct + (i.e., exterior rings are defined counter clockwise and interior + rings are defined clockwise). + By default (False), it will return the polygon with the smaller + area. + planar : bool, default False + If set to True, the edges linestrings and polygons are assumed to + be planar. In that case, additional points will be added to the line + while creating the geography objects, to ensure every point is + within 100m of the original line. + By default (False), it is assumed that the edges are spherical + (i.e. represent the shortest path on the sphere between two points). + geometry_encoding : str, default None + By default, the encoding is inferred from the GeoArrow extension + type of the input array. + However, for parsing WKT and WKB it is also possible to pass an + Arrow array without geoarrow type but with a plain string or + binary type, if specifying this keyword with "WKT" or "WKB", + respectively. + )pbdoc"); +} diff --git a/src/spherely.cpp b/src/spherely.cpp index df1d957..a63ea04 100644 --- a/src/spherely.cpp +++ b/src/spherely.cpp @@ -8,6 +8,10 @@ namespace py = pybind11; void init_geography(py::module&); void init_predicates(py::module&); void init_accessors(py::module&); +#if defined(S2GEOGRAPHY_VERSION_MAJOR) && \ + (S2GEOGRAPHY_VERSION_MAJOR >= 1 || S2GEOGRAPHY_VERSION_MINOR >= 2) +void init_geoarrow(py::module&); +#endif PYBIND11_MODULE(spherely, m) { m.doc() = R"pbdoc( @@ -21,10 +25,18 @@ PYBIND11_MODULE(spherely, m) { init_geography(m); init_predicates(m); init_accessors(m); +#if defined(S2GEOGRAPHY_VERSION_MAJOR) && \ + (S2GEOGRAPHY_VERSION_MAJOR >= 1 || S2GEOGRAPHY_VERSION_MINOR >= 2) + init_geoarrow(m); +#endif #ifdef VERSION_INFO m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO); #else m.attr("__version__") = "dev"; #endif + +#ifdef S2GEOGRAPHY_VERSION + m.attr("__s2geography_version__") = MACRO_STRINGIFY(S2GEOGRAPHY_VERSION); +#endif } diff --git a/tests/test_geoarrow.py b/tests/test_geoarrow.py new file mode 100644 index 0000000..c7cbdce --- /dev/null +++ b/tests/test_geoarrow.py @@ -0,0 +1,105 @@ +from packaging.version import Version + +import pyarrow as pa +import geoarrow.pyarrow as ga + +import pytest + +import spherely + + +pytestmark = pytest.mark.skipif( + Version(spherely.__s2geography_version__) < Version("0.2.0"), + reason="Needs s2geography >= 0.2.0", +) + + +def test_from_geoarrow_wkt(): + + arr = ga.as_wkt(["POINT (1 1)", "POINT(2 2)", "POINT(3 3)"]) + + result = spherely.from_geoarrow(arr) + expected = spherely.create([1, 2, 3], [1, 2, 3]) + # object equality does not yet work + # np.testing.assert_array_equal(result, expected) + assert spherely.equals(result, expected).all() + + # without extension type + arr = pa.array(["POINT (1 1)", "POINT(2 2)", "POINT(3 3)"]) + result = spherely.from_geoarrow(arr, geometry_encoding="WKT") + assert spherely.equals(result, expected).all() + + +def test_from_geoarrow_wkb(): + + arr = ga.as_wkt(["POINT (1 1)", "POINT(2 2)", "POINT(3 3)"]) + arr_wkb = ga.as_wkb(arr) + + result = spherely.from_geoarrow(arr_wkb) + expected = spherely.create([1, 2, 3], [1, 2, 3]) + assert spherely.equals(result, expected).all() + + # without extension type + arr_wkb = ga.as_wkb(["POINT (1 1)", "POINT(2 2)", "POINT(3 3)"]) + arr = arr_wkb.cast(pa.binary()) + result = spherely.from_geoarrow(arr, geometry_encoding="WKB") + assert spherely.equals(result, expected).all() + + +def test_from_geoarrow_native(): + + arr = ga.as_wkt(["POINT (1 1)", "POINT(2 2)", "POINT(3 3)"]) + arr_point = ga.as_geoarrow(arr) + + result = spherely.from_geoarrow(arr_point) + expected = spherely.create([1, 2, 3], [1, 2, 3]) + assert spherely.equals(result, expected).all() + + +polygon_with_bad_hole_wkt = ( + "POLYGON " + "((20 35, 10 30, 10 10, 30 5, 45 20, 20 35)," + "(30 20, 20 25, 20 15, 30 20))" +) + + +# @pytest.mark.skipif( +# Version(spherely.__s2geography_version__) < Version("0.2.0"), +# reason="Needs s2geography >= 0.2.0", +# ) +def test_from_geoarrow_oriented(): + # by default re-orients the inner ring + arr = ga.as_geoarrow([polygon_with_bad_hole_wkt]) + + result = spherely.from_geoarrow(arr) + assert ( + str(result[0]) + == "POLYGON ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (20 15, 20 25, 30 20, 20 15))" + ) + + # if we force to not orient, we get an error + with pytest.raises(RuntimeError, match="Inconsistent loop orientations detected"): + spherely.from_geoarrow(arr, oriented=True) + + +def test_from_geoarrow_planar(): + arr = ga.as_geoarrow(["LINESTRING (-64 45, 0 45)"]) + result = spherely.from_geoarrow(arr) + assert spherely.distance(result[0], spherely.Point(45, -30)) > 10000 + + result = spherely.from_geoarrow(arr, planar=True) + assert spherely.distance(result[0], spherely.Point(45, -30)) < 100 + + +def test_from_geoarrow_no_extension_type(): + arr = pa.array(["POINT (1 1)", "POINT(2 2)", "POINT(3 3)"]) + + with pytest.raises(RuntimeError, match="Expected extension type"): + spherely.from_geoarrow(arr) + + +def test_from_geoarrow_invalid_encoding(): + arr = pa.array(["POINT (1 1)", "POINT(2 2)", "POINT(3 3)"]) + + with pytest.raises(ValueError, match="'geometry_encoding' should be one"): + spherely.from_geoarrow(arr, geometry_encoding="point")