Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ORC-1717: Add geometry type #18

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 173 additions & 0 deletions specification/ORCv2.md
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ message Type {
VARCHAR = 16;
CHAR = 17;
TIMESTAMP_INSTANT = 18;
GEOMETRY = 19;
}
// the kind of this type
required Kind kind = 1;
Expand All @@ -273,6 +274,91 @@ message Type {
// the precision and scale for decimal
optional uint32 precision = 5;
optional uint32 scale = 6;
repeated StringPair attributes = 7;
// the attributes associated with the geometry type
optional GeometryType geometry = 8;
}
```

#### Geometry Type

The geometry type requires additional information, which is described in the
GeometryType message below. These attributes limit the scope of the geospatial
features that are supported for now.

```
// Attributes attached to a GEOMETRY column: how values are encoded, how edges
// between points are interpreted, and which coordinate reference system the
// coordinates use.
message GeometryType {
  enum GeometryEncoding {
    // Well-known binary (WKB) representations of geometries.
    //
    // To be clear, we follow the same rule of WKB and coordinate axis order
    // from GeoParquet [1][2]. It is the ISO WKB supporting XY, XYZ, XYM, XYZM
    // and the standard geometry types (Point, LineString, Polygon, MultiPoint,
    // MultiLineString, MultiPolygon, and GeometryCollection).
    //
    // [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92
    // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155
    WKB = 0;
  }
  // Encoding of the geometry values. The field type is the GeometryEncoding
  // enum declared in this message.
  required GeometryEncoding encoding = 1;

  // Interpretation for edges of non-point geometry objects, i.e. whether the
  // edge between points represents a straight cartesian line or the shortest
  // line on the sphere.
  enum Edges {
    PLANAR = 0;
    SPHERICAL = 1;
  }
  required Edges edges = 2;

  // Coordinate Reference System, i.e. mapping of how coordinates refer to
  // precise locations on earth. Writers are not required to set this field.
  // Once crs is set, the crs_encoding field below MUST be set together.
  // For example, "OGC:CRS84" can be set in the form of PROJJSON as below:
  //   {
  //     "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json",
  //     "type": "GeographicCRS",
  //     "name": "WGS 84 longitude-latitude",
  //     "datum": {
  //       "type": "GeodeticReferenceFrame",
  //       "name": "World Geodetic System 1984",
  //       "ellipsoid": {
  //         "name": "WGS 84",
  //         "semi_major_axis": 6378137,
  //         "inverse_flattening": 298.257223563
  //       }
  //     },
  //     "coordinate_system": {
  //       "subtype": "ellipsoidal",
  //       "axis": [
  //         {
  //           "name": "Geodetic longitude",
  //           "abbreviation": "Lon",
  //           "direction": "east",
  //           "unit": "degree"
  //         },
  //         {
  //           "name": "Geodetic latitude",
  //           "abbreviation": "Lat",
  //           "direction": "north",
  //           "unit": "degree"
  //         }
  //       ]
  //     },
  //     "id": {
  //       "authority": "OGC",
  //       "code": "CRS84"
  //     }
  //   }
  optional string crs = 3;
  // Encoding used in the above crs field. It MUST be set if crs field is set.
  // Currently the only allowed value is "PROJJSON".
  optional string crs_encoding = 4;

  // Additional informative metadata about the geometry type.
  // Recommended to write a JSON-encoded UTF-8 string.
  optional string metadata = 5;
}
```

Expand Down Expand Up @@ -301,6 +387,9 @@ message ColumnStatistics {
optional BinaryStatistics binaryStatistics = 8;
optional TimestampStatistics timestampStatistics = 9;
optional bool hasNull = 10;
optional uint64 bytes_on_disk = 11;
optional CollectionStatistics collection_statistics = 12;
optional GeometryStatistics geometry_statistics = 13;
}
```

Expand Down Expand Up @@ -395,6 +484,75 @@ message BinaryStatistics {
}
```

Geometry columns optionally store a bounding box, a list of coverings, and a
list of the geometry type codes of all values.

```
// Bounding box of geometries, represented as the min/max value pair of the
// coordinates on each axis. Values of Z and M are omitted for 2D geometries.
// Filter pushdown on geometries is only safe for planar spatial predicates,
// but it is recommended that the writer always generates bounding boxes
// regardless of whether the geometries are planar or spherical.
message BoundingBox {
// Minimum and maximum coordinate on the X axis.
required double xmin = 1;
required double xmax = 2;
// Minimum and maximum coordinate on the Y axis.
required double ymin = 3;
required double ymax = 4;
// Minimum and maximum coordinate on the Z axis; omitted for 2D geometries.
optional double zmin = 5;
optional double zmax = 6;
// Minimum and maximum coordinate on the M axis; omitted for 2D geometries.
optional double mmin = 7;
optional double mmax = 8;
}

// A custom binary-encoded polygon or multi-polygon that represents a covering
// of geometries. For example, it may be a bounding box, or an envelope when a
// bounding box cannot be built (e.g., a geometry has spherical edges, or an
// edge of geographic coordinates crosses the antimeridian). In addition, it
// can also be used to provide vendor-agnostic coverings like S2 or H3 grids.
message Covering {
// The kind of covering. Currently accepted values: "WKB".
optional string kind = 1;
// A payload specific to the kind. Below are the supported values:
// - WKB: well-known binary of a Polygon or MultiPolygon that completely
// covers the contents. This will be interpreted according to the same CRS
// and edges defined by the logical type.
// NOTE(review): both fields are declared optional; confirm how a reader
// should treat a Covering whose kind or value is unset.
optional bytes value = 2;
}

// Statistics of a geometry column: an optional bounding box, a list of
// coverings, and the list of geometry type codes found in the column.
message GeometryStatistics {
// The bounding box of geometries in the column.
optional BoundingBox bbox = 1;
// List of coverings of geometries in the column.
repeated Covering coverings = 2;
// The geometry types of all geometries, or an empty array if they are not
// known. This is borrowed from `geometry_types` column metadata of GeoParquet [1]
// except that values in the list are WKB (ISO variant) integer codes [2]. Table
// below shows the most common geometry types and their codes:
//
// | Type               | XY   | XYZ  | XYM  | XYZM |
// | :----------------- | :--- | :--- | :--- | :--- |
// | Point              | 0001 | 1001 | 2001 | 3001 |
// | LineString         | 0002 | 1002 | 2002 | 3002 |
// | Polygon            | 0003 | 1003 | 2003 | 3003 |
// | MultiPoint         | 0004 | 1004 | 2004 | 3004 |
// | MultiLineString    | 0005 | 1005 | 2005 | 3005 |
// | MultiPolygon       | 0006 | 1006 | 2006 | 3006 |
// | GeometryCollection | 0007 | 1007 | 2007 | 3007 |
//
// NOTE(review): the leading zeros above look like column padding only; the
// stored int32 values are presumably plain decimal (e.g. 1, 1001) - confirm.
//
// In addition, the following rules are used:
// - A list of multiple values indicates that multiple geometry types are
// present (e.g. `[0003, 0006]`).
// - An empty array explicitly signals that the geometry types are not known.
// - The geometry types in the list must be unique (e.g. `[0001, 0001]`
// is not valid).
//
// Please refer to links below for more detail:
// [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159
// [2] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary
repeated int32 geometry_types = 3;
}
```

### User Metadata

The user can add arbitrary key/value pairs to an ORC file as it is
Expand Down Expand Up @@ -1233,6 +1391,21 @@ Encoding | Stream Kind | Optional | Contents
DIRECT | PRESENT | Yes | Boolean RLE
| DIRECT | No | Byte RLE

## Geometry Columns

Geometry data is encoded with a PRESENT stream, a DATA stream that records
the WKB-encoded geometry data as binary, and a LENGTH stream that records
the number of bytes per value.

Encoding | Stream Kind | Optional | Contents
:------------ | :-------------- | :------- | :-------
DIRECT | PRESENT | Yes | Boolean RLE
| DATA | No | Binary contents
| LENGTH | No | Unsigned Integer RLE v1
DIRECT_V2 | PRESENT | Yes | Boolean RLE
| DATA | No | Binary contents
| LENGTH | No | Unsigned Integer RLE v2

# Indexes

## Row Group Index
Expand Down
141 changes: 141 additions & 0 deletions src/main/proto/orc/proto/orc_proto.proto
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,70 @@ message CollectionStatistics {
optional uint64 total_children = 3;
}

// Bounding box of geometries, represented as the min/max value pair of the
// coordinates on each axis. Values of Z and M are omitted for 2D geometries.
// Filter pushdown on geometries is only safe for planar spatial predicates,
// but it is recommended that the writer always generates bounding boxes
// regardless of whether the geometries are planar or spherical.
message BoundingBox {
// Minimum and maximum coordinate on the X axis.
required double xmin = 1;
required double xmax = 2;
// Minimum and maximum coordinate on the Y axis.
required double ymin = 3;
required double ymax = 4;
// Minimum and maximum coordinate on the Z axis; omitted for 2D geometries.
optional double zmin = 5;
optional double zmax = 6;
// Minimum and maximum coordinate on the M axis; omitted for 2D geometries.
optional double mmin = 7;
optional double mmax = 8;
}

// A custom binary-encoded polygon or multi-polygon that represents a covering
// of geometries. For example, it may be a bounding box, or an envelope when a
// bounding box cannot be built (e.g., a geometry has spherical edges, or an
// edge of geographic coordinates crosses the antimeridian). In addition, it
// can also be used to provide vendor-agnostic coverings like S2 or H3 grids.
message Covering {
// The kind of covering. Currently accepted values: "WKB".
optional string kind = 1;
// A payload specific to the kind. Below are the supported values:
// - WKB: well-known binary of a Polygon or MultiPolygon that completely
// covers the contents. This will be interpreted according to the same CRS
// and edges defined by the logical type.
// NOTE(review): both fields are declared optional; confirm how a reader
// should treat a Covering whose kind or value is unset.
optional bytes value = 2;
}

// Statistics of a geometry column: an optional bounding box, a list of
// coverings, and the list of geometry type codes found in the column.
message GeometryStatistics {
// The bounding box of geometries in the column.
optional BoundingBox bbox = 1;
// List of coverings of geometries in the column.
repeated Covering coverings = 2;
// The geometry types of all geometries, or an empty array if they are not
// known. This is borrowed from `geometry_types` column metadata of GeoParquet [1]
// except that values in the list are WKB (ISO variant) integer codes [2]. Table
// below shows the most common geometry types and their codes:
//
// | Type               | XY   | XYZ  | XYM  | XYZM |
// | :----------------- | :--- | :--- | :--- | :--- |
// | Point              | 0001 | 1001 | 2001 | 3001 |
// | LineString         | 0002 | 1002 | 2002 | 3002 |
// | Polygon            | 0003 | 1003 | 2003 | 3003 |
// | MultiPoint         | 0004 | 1004 | 2004 | 3004 |
// | MultiLineString    | 0005 | 1005 | 2005 | 3005 |
// | MultiPolygon       | 0006 | 1006 | 2006 | 3006 |
// | GeometryCollection | 0007 | 1007 | 2007 | 3007 |
//
// NOTE(review): the leading zeros above look like column padding only; the
// stored int32 values are presumably plain decimal (e.g. 1, 1001) - confirm.
//
// In addition, the following rules are used:
// - A list of multiple values indicates that multiple geometry types are
// present (e.g. `[0003, 0006]`).
// - An empty array explicitly signals that the geometry types are not known.
// - The geometry types in the list must be unique (e.g. `[0001, 0001]`
// is not valid).
//
// Please refer to links below for more detail:
// [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159
// [2] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary
repeated int32 geometry_types = 3;
}

message ColumnStatistics {
optional uint64 number_of_values = 1;
optional IntegerStatistics int_statistics = 2;
Expand All @@ -97,6 +161,7 @@ message ColumnStatistics {
optional bool has_null = 10;
optional uint64 bytes_on_disk = 11;
optional CollectionStatistics collection_statistics = 12;
optional GeometryStatistics geometry_statistics = 13;
}

message RowIndexEntry {
Expand Down Expand Up @@ -195,6 +260,80 @@ message StringPair {
optional string value = 2;
}

// Attributes attached to a GEOMETRY column: how values are encoded, how edges
// between points are interpreted, and which coordinate reference system the
// coordinates use.
message GeometryType {
enum GeometryEncoding {
// Well-known binary (WKB) representations of geometries.
//
// To be clear, we follow the same rule of WKB and coordinate axis order
// from GeoParquet [1][2]. It is the ISO WKB supporting XY, XYZ, XYM, XYZM
// and the standard geometry types (Point, LineString, Polygon, MultiPoint,
// MultiLineString, MultiPolygon, and GeometryCollection).
//
// [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92
// [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155
WKB = 0;
}
// Encoding of the geometry values; WKB is the only value declared so far.
required GeometryEncoding encoding = 1;

// Interpretation for edges of non-point geometry objects, i.e. whether the
// edge between points represents a straight cartesian line or the shortest
// line on the sphere.
enum Edges {
// Edges are straight cartesian lines.
PLANAR = 0;
// Edges are the shortest lines on the sphere.
SPHERICAL = 1;
}
required Edges edges = 2;

// Coordinate Reference System, i.e. mapping of how coordinates refer to
// precise locations on earth. Writers are not required to set this field.
// Once crs is set, the crs_encoding field below MUST be set together.
// For example, "OGC:CRS84" can be set in the form of PROJJSON as below:
//   {
//     "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json",
//     "type": "GeographicCRS",
//     "name": "WGS 84 longitude-latitude",
//     "datum": {
//       "type": "GeodeticReferenceFrame",
//       "name": "World Geodetic System 1984",
//       "ellipsoid": {
//         "name": "WGS 84",
//         "semi_major_axis": 6378137,
//         "inverse_flattening": 298.257223563
//       }
//     },
//     "coordinate_system": {
//       "subtype": "ellipsoidal",
//       "axis": [
//         {
//           "name": "Geodetic longitude",
//           "abbreviation": "Lon",
//           "direction": "east",
//           "unit": "degree"
//         },
//         {
//           "name": "Geodetic latitude",
//           "abbreviation": "Lat",
//           "direction": "north",
//           "unit": "degree"
//         }
//       ]
//     },
//     "id": {
//       "authority": "OGC",
//       "code": "CRS84"
//     }
//   }
optional string crs = 3;
// Encoding used in the above crs field. It MUST be set if crs field is set.
// Currently the only allowed value is "PROJJSON".
optional string crs_encoding = 4;

// Additional informative metadata about the geometry type.
// Recommended to write a JSON-encoded UTF-8 string.
optional string metadata = 5;
}

message Type {
enum Kind {
BOOLEAN = 0;
Expand All @@ -216,6 +355,7 @@ message Type {
VARCHAR = 16;
CHAR = 17;
TIMESTAMP_INSTANT = 18;
GEOMETRY = 19;
}
optional Kind kind = 1;
repeated uint32 subtypes = 2 [packed=true];
Expand All @@ -224,6 +364,7 @@ message Type {
optional uint32 precision = 5;
optional uint32 scale = 6;
repeated StringPair attributes = 7;
optional GeometryType geometry = 8;
}

message StripeInformation {
Expand Down
Loading