Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: geoarrow export #30

Merged
merged 15 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 164 additions & 1 deletion src/s2geography/geoarrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,169 @@ void Reader::ReadGeography(const ArrowArray* array, int64_t offset,
impl_->ReadGeography(array, offset, length, out);
}

} // namespace geoarrow
class WriterImpl {
public:
// Puts the object into a known "nothing initialized yet" state.
WriterImpl() {
  // The destructor only resets the writer when private_data is non-null.
  writer_.private_data = nullptr;
  // Start out with an empty (zero-length) error message.
  *error_.message = '\0';
}

// Resets the geoarrow array writer, but only when its private_data is set.
~WriterImpl() {
  const bool has_writer_state = writer_.private_data != nullptr;
  if (has_writer_state) GeoArrowArrayWriterReset(&writer_);
}

// Initializes the writer from an existing ArrowSchema.
//
// Stores `options`, initializes the array writer from `schema`, then wires up
// the visitor via InitCommon(). Throws Exception if the writer cannot be
// initialized.
void Init(const ArrowSchema* schema, const ImportOptions& options) {
  options_ = options;
  ThrowNotOk(GeoArrowArrayWriterInitFromSchema(&writer_, schema));
  InitCommon();
}

// Initializes the writer from a GeoArrowType and fills in `out_schema` with
// the extension schema for that type.
//
// Stores `options`, initializes the array writer, initializes `out_schema`,
// then wires up the visitor via InitCommon(). Throws Exception when either
// geoarrow call reports a failure.
void Init(GeoArrowType type, const ImportOptions& options,
          struct ArrowSchema* out_schema) {
  options_ = options;

  // The writer is set up first; the schema is only produced afterwards.
  ThrowNotOk(GeoArrowArrayWriterInitFromType(&writer_, type));
  ThrowNotOk(GeoArrowSchemaInitExtension(out_schema, type));

  InitCommon();
}

// Shared tail of both Init() overloads: points the visitor at this object's
// error buffer and asks the array writer to populate the visitor.
// Throws Exception if the visitor cannot be initialized.
void InitCommon() {
  visitor_.error = &error_;
  ThrowNotOk(GeoArrowArrayWriterInitVisitor(&writer_, &visitor_));
}

// Writes `geographies_size` geographies into the writer and finishes the
// result into `out`.
//
// geographies:      array of pointers to the geographies to write; every
//                   entry is dereferenced.
// geographies_size: number of entries in `geographies`.
// out:              receives the finished ArrowArray.
//
// Throws Exception if visiting any feature or finishing the array fails.
void WriteGeography(const Geography** geographies, size_t geographies_size,
                    struct ArrowArray* out) {
  for (size_t i = 0; i < geographies_size; i++) {
    // VisitFeature() reports visitor failures through its return code; check
    // it so that a failed feature is not silently dropped.
    ThrowNotOk(VisitFeature(*geographies[i]));
  }
  ThrowNotOk(GeoArrowArrayWriterFinish(&writer_, out, &error_));
}
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved

private:
ImportOptions options_;
GeoArrowArrayWriter writer_;
GeoArrowVisitor visitor_;
GeoArrowCoordView coords_view_;
GeoArrowError error_;

int VisitPoints(const PointGeography& point) {
coords_view_.n_coords = 1;
coords_view_.n_values = 2;
coords_view_.coords_stride = 2;
double coords[2];

if (point.Points().size() == 0) {
GEOARROW_RETURN_NOT_OK(visitor_.geom_start(
&visitor_, GEOARROW_GEOMETRY_TYPE_POINT, GEOARROW_DIMENSIONS_XY));
GEOARROW_RETURN_NOT_OK(visitor_.geom_end(&visitor_));
} else if (point.Points().size() == 1) {
GEOARROW_RETURN_NOT_OK(visitor_.geom_start(
&visitor_, GEOARROW_GEOMETRY_TYPE_POINT, GEOARROW_DIMENSIONS_XY));
S2LatLng ll(point.Points()[0]);
coords[0] = ll.lng().degrees();
coords[1] = ll.lat().degrees();
coords_view_.values[0] = &coords[0];
coords_view_.values[1] = &coords[1];
// coords_view_.values = static_cast<const double*[4]>(coords);
// GEOARROW_COORD_VIEW_VALUE(&coords_view_, 0, 0) = ll.lng().degrees();
// GEOARROW_COORD_VIEW_VALUE(&coords_view_, 0, 1) = ll.lng().degrees();
GEOARROW_RETURN_NOT_OK(visitor_.coords(&visitor_, &coords_view_));
GEOARROW_RETURN_NOT_OK(visitor_.geom_end(&visitor_));
}
// TODO MultiPoint
// } else {
// handler->new_geometry_type(util::GeometryType::MULTIPOINT);
// HANDLE_OR_RETURN(handler->geom_start(util::GeometryType::MULTIPOINT,
// geog.Points().size()));

// for (const S2Point& pt : geog.Points()) {
// handler->geom_start(util::GeometryType::POINT, 1);
// S2LatLng ll(pt);
// coords[0] = ll.lng().degrees();
// coords[1] = ll.lat().degrees();
// HANDLE_OR_RETURN(handler->coords(coords, 1, 2));
// HANDLE_OR_RETURN(handler->geom_end());
// }

// handler->geom_end();
// }
return GEOARROW_OK;
}

// Feeds one Geography to the visitor as a single feature.
//
// Only PointGeography is handled at the moment; any other subclass raises
// Exception("Unsupported Geography subclass").
// Returns GEOARROW_OK, or the first non-OK code returned by the visitor.
int VisitFeature(const Geography& geog) {
  GEOARROW_RETURN_NOT_OK(visitor_.feat_start(&visitor_));

  const auto* child_point = dynamic_cast<const PointGeography*>(&geog);
  if (child_point == nullptr) {
    // TODO: polylines, polygons and geography collections.
    throw Exception("Unsupported Geography subclass");
  }
  GEOARROW_RETURN_NOT_OK(VisitPoints(*child_point));

  // Close the feature opened by feat_start(); this call was missing before,
  // so the visitor was never told that the feature was complete.
  GEOARROW_RETURN_NOT_OK(visitor_.feat_end(&visitor_));
  return GEOARROW_OK;
}

void ThrowNotOk(int code) {
if (code != GEOARROW_OK) {
throw Exception(error_.message);
}
}
};

Writer::Writer() : impl_(new WriterImpl()) {}

// Defined out of line, where WriterImpl is a complete type; destroying impl_
// deletes the implementation object.
Writer::~Writer() = default;

void Writer::Init(const ArrowSchema* schema, const ImportOptions& options) {
impl_->Init(schema, options);
}

void Writer::Init(OutputType output_type, const ImportOptions& options,
struct ArrowSchema* out_schema) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the case of initializing the writer with just a type, I added an ArrowSchema as output argument, because for exporting the array to another producer, we need both the array and schema.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I put this above, too, but I think you want a const ArrowSchema* as your entry point into the Writer (you can expose a function to generate the appropriate ArrowSchema* so that those APIs are decoupled).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. If we have a separate function to construct this schema from a type / encoding (or from those inferred from an array of geometries), then it is perfectly fine to only have the Init here that starts from a schema.

switch (output_type) {
case OutputType::kPoints:
impl_->Init(GEOARROW_TYPE_INTERLEAVED_POINT, options, out_schema);
break;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just added kPoints (mapping to interleaved points) for a quick test. I assume we might want to add some generic "native geoarrow" option like kGeoArrow(Interleaved), and then infer the schema from the geographies (with something similar to the GeoArrowGEOSSchemaCalculator you wrote in geoarrow-c-geos)?

It might still be worth allowing the user to specify the type themselves to avoid inference / choose a specific type (e.g. always the multi-version regardless of presence of any multi geom)

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

geoarrow-c-geos also has a function to help you make a schema:

https://github.com/geoarrow/geoarrow-c-geos/blob/33ad0ba21c76c09e9d72fc4e4ae0b9ff9da61848/src/geoarrow_geos/geoarrow_geos.h#L122-L123

The right entry point I think is still ArrowSchema* (until a point where geoarrow-c exposes an ABI for specifying geometry type constraints, which is I think a ways off). It's definitely good to let the user specify this (often they already know because they're calculating a centroid or something).

The calculator is a definite must! It can probably use geography->num_shapes() and geography->dimension():

virtual int dimension() const {
if (num_shapes() == 0) {
return -1;
}
int dim = Shape(0)->dimension();
for (int i = 1; i < num_shapes(); i++) {
if (dim != Shape(i)->dimension()) {
return -1;
}
}
return dim;
}
// The number of S2Shape objects needed to represent this Geography
virtual int num_shapes() const = 0;

case OutputType::kWKT:
impl_->Init(GEOARROW_TYPE_WKT, options, out_schema);
break;
case OutputType::kWKB:
impl_->Init(GEOARROW_TYPE_WKB, options, out_schema);
break;
default:
throw Exception("Output type not supported");
}
}

void Writer::WriteGeography(const Geography** geographies,
size_t geographies_size, struct ArrowArray* out) {
impl_->WriteGeography(geographies, geographies_size, out);
}

} // namespace s2geography
29 changes: 29 additions & 0 deletions src/s2geography/geoarrow.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,35 @@ class Reader {
std::unique_ptr<ReaderImpl> impl_;
};

class WriterImpl;

/// \brief Array writer for any GeoArrow extension array
///
/// This class is used to convert a vector of Geography objects into an ArrowArray
/// with geoarrow data (serialized or native).
class Writer {
 public:
  /// \brief Output encodings selectable without supplying a schema
  ///
  /// kPoints is written as interleaved GeoArrow points, kWKT as GeoArrow WKT
  /// and kWKB as GeoArrow WKB.
  enum class OutputType { kPoints, kWKT, kWKB };

  Writer();
  ~Writer();

  /// \brief Initialize from an existing schema using default options
  void Init(const ArrowSchema* schema) { Init(schema, ImportOptions()); }

  /// \brief Initialize from an existing schema
  void Init(const ArrowSchema* schema, const ImportOptions& options);

  /// \brief Initialize from an output type using default options
  ///
  /// out_schema is populated with the schema matching output_type.
  void Init(OutputType output_type, struct ArrowSchema* out_schema) {
    Init(output_type, ImportOptions(), out_schema);
  }

  /// \brief Initialize from an output type
  ///
  /// out_schema is populated with the schema matching output_type.
  void Init(OutputType output_type, const ImportOptions& options,
            struct ArrowSchema* out_schema);

  /// \brief Write geographies_size geographies into out
  void WriteGeography(const Geography** geographies, size_t geographies_size,
                      struct ArrowArray* out);

 private:
  std::unique_ptr<WriterImpl> impl_;
};

} // namespace geoarrow

} // namespace s2geography
Loading