From e18295300ce91d7c972134cc71c53c66b8476c43 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 9 Oct 2024 10:41:21 +0200 Subject: [PATCH] ENH: export to geoarrow output --- CMakeLists.txt | 8 +- src/geoarrow.cpp | 182 +++++++++++++++++++++++++++++++++++++++++ tests/test_geoarrow.py | 27 ++++++ 3 files changed, 213 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8cbc7fd..b377f93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,10 +97,10 @@ target_link_libraries(spherely ) pybind11_extension(spherely) -if(NOT MSVC AND NOT ${CMAKE_BUILD_TYPE} MATCHES Debug|RelWithDebInfo) - # Strip unnecessary sections of the binary on Linux/macOS - pybind11_strip(spherely) -endif() +# if(NOT MSVC AND NOT ${CMAKE_BUILD_TYPE} MATCHES Debug|RelWithDebInfo) +# # Strip unnecessary sections of the binary on Linux/macOS +# pybind11_strip(spherely) +# endif() set_target_properties(spherely PROPERTIES CXX_VISIBILITY_PRESET "hidden") diff --git a/src/geoarrow.cpp b/src/geoarrow.cpp index 4a427ac..a0b1723 100644 --- a/src/geoarrow.cpp +++ b/src/geoarrow.cpp @@ -56,7 +56,167 @@ py::array_t from_geoarrow(py::object input, return result; } +/// \brief Object holding (and managing the memory of) an Arrow array (ArrowArray and ArrowSchema +/// combo) +class ArrowArrayHolder { +public: + /// \brief Construct an invalid instance holding no resources + ArrowArrayHolder() { + array_.release = nullptr; + schema_.release = nullptr; + } + + /// \brief Move and take ownership of data + ArrowArrayHolder(ArrowArray* array, ArrowSchema* schema) { + move(array, schema, &array_, &schema_); + } + + /// \brief Move and take ownership of data wrapped by rhs + ArrowArrayHolder(ArrowArrayHolder&& rhs) : ArrowArrayHolder(rhs.array(), rhs.schema()) {} + ArrowArrayHolder& operator=(ArrowArrayHolder&& rhs) { + reset(rhs.array(), rhs.schema()); + return *this; + } + + // These objects are not copyable + ArrowArrayHolder(const ArrowArrayHolder& rhs) = delete; + + /// 
\brief Get a pointer to the data owned by this object + ArrowArray* array() noexcept { + return &array_; + } + const ArrowArray* array() const noexcept { + return &array_; + } + + ArrowSchema* schema() noexcept { + return &schema_; + } + const ArrowSchema* schema() const noexcept { + return &schema_; + } + + py::tuple return_capsules(py::args args, const py::kwargs& kwargs) { + if ((args.size() > 0) && (!args[0].is_none())) { + throw std::invalid_argument( + "Additional arguments (such as requested_schema) with a non-default value are not " + "supported"); + } + if (kwargs) { + for (auto& item : kwargs) { + if (!item.second.is_none()) { + throw std::invalid_argument( + "Additional arguments (such as requested_schema) with a non-default value " + "are not supported"); + } + } + } + + ArrowArray* c_array = static_cast(malloc(sizeof(ArrowArray))); + ArrowSchema* c_schema = static_cast(malloc(sizeof(ArrowSchema))); + move(&array_, &schema_, c_array, c_schema); + + constexpr auto array_cleanup = [](void* ptr) noexcept { + auto array = static_cast(ptr); + if (array->release != nullptr) { + array->release(array); + } + free(array); + }; + py::capsule array_capsule{c_array, "arrow_array", array_cleanup}; + + constexpr auto schema_cleanup = [](void* ptr) noexcept { + auto schema = static_cast(ptr); + if (schema->release != nullptr) { + schema->release(schema); + } + free(schema); + }; + py::capsule schema_capsule{c_schema, "arrow_schema", schema_cleanup}; + + return py::make_tuple(schema_capsule, array_capsule); + } + + void move(ArrowArray* array_src, + ArrowSchema* schema_src, + ArrowArray* array_dst, + ArrowSchema* schema_dst) { + memcpy(array_dst, array_src, sizeof(struct ArrowArray)); + array_src->release = nullptr; + + memcpy(schema_dst, schema_src, sizeof(struct ArrowSchema)); + schema_src->release = nullptr; + } + + /// \brief Call data's release callback if valid + void reset() { + if (array_.release != nullptr) { + array_.release(&array_); + } + + if 
(schema_.release != nullptr) { + schema_.release(&schema_); + } + } + + /// \brief Call data's release callback if valid and move ownership of the data + /// pointed to by data + void reset(ArrowArray* array_src, ArrowSchema* schema_src) { + reset(); + move(array_src, schema_src, &array_, &schema_); + } + + ~ArrowArrayHolder() { + reset(); + } + +protected: + ArrowArray array_; + ArrowSchema schema_; +}; + +ArrowArrayHolder to_geoarrow(py::array_t input, py::object geometry_encoding) { + ArrowArrayHolder array = ArrowArrayHolder(); + + s2geog::geoarrow::Writer writer; + std::vector> s2geog_vec; + + if (geometry_encoding.is(py::none())) { + // writer.Init(schema, options); + throw std::invalid_argument("not yet implemented"); + } else if (geometry_encoding.equal(py::str("points"))) { + writer.Init(s2geog::geoarrow::Writer::OutputType::kPoints, array.schema()); + } else if (geometry_encoding.equal(py::str("WKT"))) { + writer.Init(s2geog::geoarrow::Writer::OutputType::kWKT, array.schema()); + } else if (geometry_encoding.equal(py::str("WKB"))) { + writer.Init(s2geog::geoarrow::Writer::OutputType::kWKB, array.schema()); + } else { + throw std::invalid_argument("'geometry_encoding' should be one of None, 'WKT' or 'WKB'"); + } + + size_t num_geographies = static_cast(input.size()); + + const s2geog::Geography** geographies = static_cast( + malloc(sizeof(const s2geog::Geography*) * num_geographies)); + + for (int i = 0; i < input.size(); i++) { + geographies[i] = &((*input.data(i)).as_geog_ptr()->geog()); + } + + // for (int i = 0; i < input.size(); i++) { + // auto geog_ptr = (*input.data(i)).as_geog_ptr(); + // s2geog_vec.push_back(std::make_unique(geog_ptr->geog())); + // } + + writer.WriteGeography(geographies, num_geographies, array.array()); + + return std::move(array); +} + void init_geoarrow(py::module& m) { + py::class_(m, "ArrowArrayHolder") + .def("__arrow_c_array__", &ArrowArrayHolder::return_capsules); + m.def("from_geoarrow", &from_geoarrow, 
py::arg("input"), @@ -104,4 +264,26 @@ void init_geoarrow(py::module& m) { binary type, if specifying this keyword with "WKT" or "WKB", respectively. )pbdoc"); + + m.def("to_geoarrow", + &to_geoarrow, + py::arg("input"), + py::pos_only(), + py::kw_only(), + py::arg("geometry_encoding") = py::none(), + R"pbdoc( + Convert an array of geographies to an Arrow array object with a GeoArrow + extension type. + + See https://geoarrow.org/ for details on the GeoArrow specification. + + Parameters + ---------- + input : array_like + An array of geography objects. + geometry_encoding : str, default None + The encoding of the output array: one of "points", "WKT" or "WKB". + The default (None) is not yet implemented and raises an error; + pass one of the encodings listed above explicitly. + )pbdoc"); } diff --git a/tests/test_geoarrow.py b/tests/test_geoarrow.py index c7cbdce..d9a3bb2 100644 --- a/tests/test_geoarrow.py +++ b/tests/test_geoarrow.py @@ -1,5 +1,6 @@ from packaging.version import Version +import numpy as np import pyarrow as pa import geoarrow.pyarrow as ga @@ -103,3 +104,29 @@ def test_from_geoarrow_invalid_encoding(): with pytest.raises(ValueError, match="'geometry_encoding' should be one"): spherely.from_geoarrow(arr, geometry_encoding="point") + + +def test_to_geoarrow(): + arr = spherely.create([1, 2, 3], [1, 2, 3]) + res = spherely.to_geoarrow(arr, geometry_encoding="points") + assert isinstance(res, spherely.ArrowArrayHolder) + assert hasattr(res, "__arrow_c_array__") + + arr_pa = pa.array(res) + coords = np.asarray(arr_pa.storage.values) + expected = np.array([1, 1, 2, 2, 3, 3], dtype="float64") + np.testing.assert_allclose(coords, expected) + + +def test_to_geoarrow_wkt(): + arr = spherely.create([1, 2, 3], [1, 2, 3]) + result = pa.array(spherely.to_geoarrow(arr, geometry_encoding="WKT")) + # TODO assert result + print(result) + + +def test_to_geoarrow_wkb(): + arr = spherely.create([1, 2, 3], [1, 2, 3]) + result = pa.array(spherely.to_geoarrow(arr, geometry_encoding="WKB")) + 
# TODO assert result + print(result)