From 9ebd7b106ba2ae30046cb4f3e9561a8bbdb18b9b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 9 Oct 2024 18:54:44 +0200 Subject: [PATCH] refactor: Reuse GeoArrowArrayReader in ReaderImpl (#29) I am not entirely sure this is desired (it gives a bit of overhead), but while trying to understand the reader/writer/builder/visitor hierarchy in geoarrow-c, I noticed that here we were not using `GeoArrowArrayReader`, but the separate readers for WKT/WKB (`GeoArrowWKTReader`/`GeoArrowWKBReader`) and `GeoArrowArrayViewVisit` for native coords, duplicating the iteration function for WKT/WKB. While the `GeoArrowArrayReader` already abstracts away those three options behind a single interface. This gives a bit more overhead for the scalar `WKTReader` (I assume because this constructs a `geoarrow::Reader` every time, and this now not just initializes the `GeoArrowWKTReader`, but also the other structs for WKB and native). Given this is the scalar version (and you can always ensure to start from arrow input for optimal performance), I think that overhead is acceptable. But alternatively, we could also update `GeoArrowArrayReader` to only initialize the WKT/WKB readers if needed (when it gets data passed with that type), instead of upfront. Or another option is to expose the `GeoArrowArrayViewVisitWKT/WKB` functions in geoarrow.h, so that we can at least avoid reimplementing those. Also added some brief doc comments. --- src/s2geography/geoarrow.cc | 94 +++++-------------------------------- src/s2geography/geoarrow.h | 4 ++ 2 files changed, 16 insertions(+), 82 deletions(-) diff --git a/src/s2geography/geoarrow.cc b/src/s2geography/geoarrow.cc index 72c4859..cf77573 100644 --- a/src/s2geography/geoarrow.cc +++ b/src/s2geography/geoarrow.cc @@ -61,6 +61,12 @@ static inline int8_t GeoArrowBitmapReaderNextIsNull( return (bitmap_reader->byte & (1 << bitmap_reader->bit_i)) == 0; } +/// \brief Construct Geography objects while visiting a GeoArrow array. +/// +/// This class implements the visitor callbacks that construct Geography +/// objects. This visitor gets called by the Reader, which iterates over the +/// features in a GeoArrow ArrowArray (that can be either in WKT or WKB +/// serialized format or in native format). class Constructor { public: Constructor(const ImportOptions& options) : options_(options) { @@ -577,17 +583,12 @@ class ReaderImpl { public: ReaderImpl() { error_.message[0] = '\0'; - wkt_reader_.private_data = nullptr; - wkb_reader_.private_data = nullptr; + reader_.private_data = nullptr; } ~ReaderImpl() { - if (wkt_reader_.private_data != nullptr) { - GeoArrowWKTReaderReset(&wkt_reader_); - } - - if (wkb_reader_.private_data != nullptr) { - GeoArrowWKBReaderReset(&wkb_reader_); + if (reader_.private_data != nullptr) { + GeoArrowArrayReaderReset(&reader_); } } @@ -612,13 +613,7 @@ class ReaderImpl { constructor_->InitVisitor(&visitor_); visitor_.error = &error_; - if (array_view_.schema_view.type == GEOARROW_TYPE_WKT) { - GeoArrowWKTReaderInit(&wkt_reader_); - } - - if (array_view_.schema_view.type == GEOARROW_TYPE_WKB) { - GeoArrowWKBReaderInit(&wkb_reader_); - } + GeoArrowArrayReaderInit(&reader_); } void ReadGeography(const ArrowArray* array, int64_t offset, int64_t length, @@ -631,19 +626,7 @@ class ReaderImpl { } constructor_->SetOutput(out); - - switch (array_view_.schema_view.type) { - case GEOARROW_TYPE_WKT: - code = VisitWKT(offset, length); - break; - case GEOARROW_TYPE_WKB: - code = VisitWKB(offset, length); - break; - default: - code = GeoArrowArrayViewVisit(&array_view_, offset, length, &visitor_); - break; - } - + code = GeoArrowArrayReaderVisit(&reader_, &array_view_, offset, length, &visitor_); ThrowNotOk(code); } @@ -651,63 +634,10 @@ class ReaderImpl { ImportOptions options_; std::unique_ptr constructor_; GeoArrowArrayView array_view_; - GeoArrowWKTReader wkt_reader_; - GeoArrowWKBReader wkb_reader_; + GeoArrowArrayReader reader_; GeoArrowVisitor visitor_; GeoArrowError error_; - int VisitWKT(int64_t offset, int64_t length) { - offset += array_view_.offset[0]; - const uint8_t* validity = array_view_.validity_bitmap; - const int32_t* offsets = array_view_.offsets[0]; - const char* data = reinterpret_cast(array_view_.data); - GeoArrowStringView item; - - GeoArrowBitmapReader bitmap; - GeoArrowBitmapReaderInit(&bitmap, validity, offset); - - for (int64_t i = 0; i < length; i++) { - if (GeoArrowBitmapReaderNextIsNull(&bitmap)) { - GEOARROW_RETURN_NOT_OK(visitor_.feat_start(&visitor_)); - GEOARROW_RETURN_NOT_OK(visitor_.null_feat(&visitor_)); - GEOARROW_RETURN_NOT_OK(visitor_.feat_end(&visitor_)); - } else { - item.size_bytes = offsets[offset + i + 1] - offsets[offset + i]; - item.data = data + offsets[offset + i]; - GEOARROW_RETURN_NOT_OK( - GeoArrowWKTReaderVisit(&wkt_reader_, item, &visitor_)); - } - } - - return GEOARROW_OK; - } - - int VisitWKB(int64_t offset, int64_t length) { - offset += array_view_.offset[0]; - const uint8_t* validity = array_view_.validity_bitmap; - const int32_t* offsets = array_view_.offsets[0]; - const uint8_t* data = array_view_.data; - GeoArrowBufferView item; - - GeoArrowBitmapReader bitmap; - GeoArrowBitmapReaderInit(&bitmap, validity, offset); - - for (int64_t i = 0; i < length; i++) { - if (GeoArrowBitmapReaderNextIsNull(&bitmap)) { - GEOARROW_RETURN_NOT_OK(visitor_.feat_start(&visitor_)); - GEOARROW_RETURN_NOT_OK(visitor_.null_feat(&visitor_)); - GEOARROW_RETURN_NOT_OK(visitor_.feat_end(&visitor_)); - } else { - item.size_bytes = offsets[offset + i + 1] - offsets[offset + i]; - item.data = data + offsets[offset + i]; - GEOARROW_RETURN_NOT_OK( - GeoArrowWKBReaderVisit(&wkb_reader_, item, &visitor_)); - } - } - - return GEOARROW_OK; - } - void ThrowNotOk(int code) { if (code != GEOARROW_OK) { throw Exception(error_.message); diff --git a/src/s2geography/geoarrow.h b/src/s2geography/geoarrow.h index cef0010..be708d7 100644 --- a/src/s2geography/geoarrow.h +++ b/src/s2geography/geoarrow.h @@ -46,6 +46,10 @@ class ImportOptions { class ReaderImpl; +/// \brief Array reader for any GeoArrow extension array +/// +/// This class is used to convert an ArrowArray with geoarrow data (serialized +/// or native) into a vector of Geography objects. class Reader { public: enum class InputType { kWKT, kWKB };