diff --git a/specification/ORCv2.md b/specification/ORCv2.md index 3411485..d3fbd31 100644 --- a/specification/ORCv2.md +++ b/specification/ORCv2.md @@ -261,6 +261,7 @@ message Type { VARCHAR = 16; CHAR = 17; TIMESTAMP_INSTANT = 18; + GEOMETRY = 19; } // the kind of this type required Kind kind = 1; @@ -273,6 +274,91 @@ message Type { // the precision and scale for decimal optional uint32 precision = 5; optional uint32 scale = 6; + repeated StringPair attributes = 7; + // the attributes associated with the geometry type + optional GeometryType geometry = 8; +} +``` + +#### Geometry Type + +Geometry type requires additional information as described in the GeometryType +message below. These attributes limit the scope of geospatial features that +we can support for now. + +``` +message GeometryType { + enum GeometryEncoding { + // Well-known binary (WKB) representations of geometries. + // + // To be clear, we follow the same rule of WKB and coordinate axis order + // from GeoParquet [1][2]. It is the ISO WKB supporting XY, XYZ, XYM, XYZM + // and the standard geometry types (Point, LineString, Polygon, MultiPoint, + // MultiLineString, MultiPolygon, and GeometryCollection). + // + // [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 + // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 + WKB = 0; + } + required GeometryEncoding encoding = 1; + + // Interpretation for edges of non-point geometry objects, i.e. whether the + // edge between points represent a straight cartesian line or the shortest + // line on the sphere. + enum Edges { + PLANAR = 0; + SPHERICAL = 1; + } + required Edges edges = 2; + + // Coordinate Reference System, i.e. mapping of how coordinates refer to + // precise locations on earth. Writers are not required to set this field. + // Once crs is set, crs_encoding field below MUST be set together.
+ // For example, "OGC:CRS84" can be set in the form of PROJJSON as below: + // { + // "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", + // "type": "GeographicCRS", + // "name": "WGS 84 longitude-latitude", + // "datum": { + // "type": "GeodeticReferenceFrame", + // "name": "World Geodetic System 1984", + // "ellipsoid": { + // "name": "WGS 84", + // "semi_major_axis": 6378137, + // "inverse_flattening": 298.257223563 + // } + // }, + // "coordinate_system": { + // "subtype": "ellipsoidal", + // "axis": [ + // { + // "name": "Geodetic longitude", + // "abbreviation": "Lon", + // "direction": "east", + // "unit": "degree" + // }, + // { + // "name": "Geodetic latitude", + // "abbreviation": "Lat", + // "direction": "north", + // "unit": "degree" + // } + // ] + // }, + // "id": { + // "authority": "OGC", + // "code": "CRS84" + // } + // } + // + optional string crs = 3; + // Encoding used in the above crs field. It MUST be set if crs field is set. + // Currently the only allowed value is "PROJJSON". + optional string crs_encoding = 4; + + // Additional informative metadata about the geometry type. + // Recommended to write a JSON-encoded UTF-8 string. + optional string metadata = 5; } ``` @@ -301,6 +387,9 @@ message ColumnStatistics { optional BinaryStatistics binaryStatistics = 8; optional TimestampStatistics timestampStatistics = 9; optional bool hasNull = 10; + optional uint64 bytes_on_disk = 11; + optional CollectionStatistics collection_statistics = 12; + optional GeometryStatistics geometry_statistics = 13; } ``` @@ -395,6 +484,75 @@ message BinaryStatistics { } ``` +Geometry columns store optional bounding boxes, coverings and list of +geometry type codes from all values. + +``` +// Bounding box of geometries in the representation of min/max value pair of +// coordinates from each axis. Values of Z and M are omitted for 2D geometries. 
+// Filter pushdown on geometries is only safe for planar spatial predicates +// but it is recommended that the writer always generates bounding boxes +// regardless of whether the geometries are planar or spherical. +message BoundingBox { + required double xmin = 1; + required double xmax = 2; + required double ymin = 3; + required double ymax = 4; + optional double zmin = 5; + optional double zmax = 6; + optional double mmin = 7; + optional double mmax = 8; +} + +// A custom binary-encoded polygon or multi-polygon to represent a covering of +// geometries. For example, it may be a bounding box or an envelope when a +// bounding box cannot be built (e.g., a geometry has spherical edges, or if +// an edge of geographic coordinates crosses the antimeridian). In addition, +// it can also be used to provide vendor-agnostic coverings like S2 or H3 grids. +message Covering { + // A type of covering. Currently accepted values: "WKB". + optional string kind = 1; + // A payload specific to the kind. Below are the supported values: + // - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely + // covers the contents. This will be interpreted according to the same CRS + // and edges defined by the logical type. + optional bytes value = 2; +} + +message GeometryStatistics { + // The bounding box of geometries in the column. + optional BoundingBox bbox = 1; + // List of coverings of geometries in the column. + repeated Covering coverings = 2; + // The geometry types of all geometries, or an empty array if they are not + // known. This is borrowed from `geometry_types` column metadata of GeoParquet [2] + // except that values in the list are WKB (ISO variant) integer codes [1].
Table + // below shows the most common geometry types and their codes: + // + // | Type | XY | XYZ | XYM | XYZM | + // | :----------------- | :--- | :--- | :--- | :--: | + // | Point | 0001 | 1001 | 2001 | 3001 | + // | LineString | 0002 | 1002 | 2002 | 3002 | + // | Polygon | 0003 | 1003 | 2003 | 3003 | + // | MultiPoint | 0004 | 1004 | 2004 | 3004 | + // | MultiLineString | 0005 | 1005 | 2005 | 3005 | + // | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + // | GeometryCollection | 0007 | 1007 | 2007 | 3007 | + // + // In addition, the following rules are used: + // - A list of multiple values indicates that multiple geometry types are + // present (e.g. `[0003, 0006]`). + // - An empty array explicitly signals that the geometry types are not known. + // - The geometry types in the list must be unique (e.g. `[0001, 0001]` + // is not valid). + // + // Please refer to links below for more detail: + // [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary + // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 + repeated int32 geometry_types = 3; +} +``` + ### User Metadata The user can add arbitrary key/value pairs to an ORC file as it is @@ -1233,6 +1391,21 @@ Encoding | Stream Kind | Optional | Contents DIRECT | PRESENT | Yes | Boolean RLE | DIRECT | No | Byte RLE +## Geometry Columns + +Geometry data is encoded with a PRESENT stream, a DATA stream that records +the WKB-encoded geometry data as binary, and a LENGTH stream that records +the number of bytes per value.
+ +Encoding | Stream Kind | Optional | Contents +:------------ | :-------------- | :------- | :------- +DIRECT | PRESENT | Yes | Boolean RLE + | DATA | No | Binary contents + | LENGTH | No | Unsigned Integer RLE v1 +DIRECT_V2 | PRESENT | Yes | Boolean RLE + | DATA | No | Binary contents + | LENGTH | No | Unsigned Integer RLE v2 + # Indexes ## Row Group Index diff --git a/src/main/proto/orc/proto/orc_proto.proto b/src/main/proto/orc/proto/orc_proto.proto index 16c5523..d57b5d8 100644 --- a/src/main/proto/orc/proto/orc_proto.proto +++ b/src/main/proto/orc/proto/orc_proto.proto @@ -84,6 +84,70 @@ message CollectionStatistics { optional uint64 total_children = 3; } +// Bounding box of geometries in the representation of min/max value pair of +// coordinates from each axis. Values of Z and M are omitted for 2D geometries. +// Filter pushdown on geometries is only safe for planar spatial predicates +// but it is recommended that the writer always generates bounding boxes +// regardless of whether the geometries are planar or spherical. +message BoundingBox { + required double xmin = 1; + required double xmax = 2; + required double ymin = 3; + required double ymax = 4; + optional double zmin = 5; + optional double zmax = 6; + optional double mmin = 7; + optional double mmax = 8; +} + +// A custom binary-encoded polygon or multi-polygon to represent a covering of +// geometries. For example, it may be a bounding box or an envelope when a +// bounding box cannot be built (e.g., a geometry has spherical edges, or if +// an edge of geographic coordinates crosses the antimeridian). In addition, +// it can also be used to provide vendor-agnostic coverings like S2 or H3 grids. +message Covering { + // A type of covering. Currently accepted values: "WKB". + optional string kind = 1; + // A payload specific to the kind. Below are the supported values: + // - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely + // covers the contents.
This will be interpreted according to the same CRS + // and edges defined by the logical type. + optional bytes value = 2; +} + +message GeometryStatistics { + // The bounding box of geometries in the column. + optional BoundingBox bbox = 1; + // List of coverings of geometries in the column. + repeated Covering coverings = 2; + // The geometry types of all geometries, or an empty array if they are not + // known. This is borrowed from `geometry_types` column metadata of GeoParquet [2] + // except that values in the list are WKB (ISO variant) integer codes [1]. Table + // below shows the most common geometry types and their codes: + // + // | Type | XY | XYZ | XYM | XYZM | + // | :----------------- | :--- | :--- | :--- | :--: | + // | Point | 0001 | 1001 | 2001 | 3001 | + // | LineString | 0002 | 1002 | 2002 | 3002 | + // | Polygon | 0003 | 1003 | 2003 | 3003 | + // | MultiPoint | 0004 | 1004 | 2004 | 3004 | + // | MultiLineString | 0005 | 1005 | 2005 | 3005 | + // | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + // | GeometryCollection | 0007 | 1007 | 2007 | 3007 | + // + // In addition, the following rules are used: + // - A list of multiple values indicates that multiple geometry types are + // present (e.g. `[0003, 0006]`). + // - An empty array explicitly signals that the geometry types are not known. + // - The geometry types in the list must be unique (e.g. `[0001, 0001]` + // is not valid).
+ // + // Please refer to links below for more detail: + // [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary + // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 + repeated int32 geometry_types = 3; +} + message ColumnStatistics { optional uint64 number_of_values = 1; optional IntegerStatistics int_statistics = 2; @@ -97,6 +161,7 @@ message ColumnStatistics { optional bool has_null = 10; optional uint64 bytes_on_disk = 11; optional CollectionStatistics collection_statistics = 12; + optional GeometryStatistics geometry_statistics = 13; } message RowIndexEntry { @@ -195,6 +260,80 @@ message StringPair { optional string value = 2; } +message GeometryType { + enum GeometryEncoding { + // Well-known binary (WKB) representations of geometries. + // + // To be clear, we follow the same rule of WKB and coordinate axis order + // from GeoParquet [1][2]. It is the ISO WKB supporting XY, XYZ, XYM, XYZM + // and the standard geometry types (Point, LineString, Polygon, MultiPoint, + // MultiLineString, MultiPolygon, and GeometryCollection). + // + // [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 + // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 + WKB = 0; + } + required GeometryEncoding encoding = 1; + + // Interpretation for edges of non-point geometry objects, i.e. whether the + // edge between points represent a straight cartesian line or the shortest + // line on the sphere. + enum Edges { + PLANAR = 0; + SPHERICAL = 1; + } + required Edges edges = 2; + + // Coordinate Reference System, i.e. mapping of how coordinates refer to + // precise locations on earth. Writers are not required to set this field. + // Once crs is set, crs_encoding field below MUST be set together. 
+ // For example, "OGC:CRS84" can be set in the form of PROJJSON as below: + // { + // "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", + // "type": "GeographicCRS", + // "name": "WGS 84 longitude-latitude", + // "datum": { + // "type": "GeodeticReferenceFrame", + // "name": "World Geodetic System 1984", + // "ellipsoid": { + // "name": "WGS 84", + // "semi_major_axis": 6378137, + // "inverse_flattening": 298.257223563 + // } + // }, + // "coordinate_system": { + // "subtype": "ellipsoidal", + // "axis": [ + // { + // "name": "Geodetic longitude", + // "abbreviation": "Lon", + // "direction": "east", + // "unit": "degree" + // }, + // { + // "name": "Geodetic latitude", + // "abbreviation": "Lat", + // "direction": "north", + // "unit": "degree" + // } + // ] + // }, + // "id": { + // "authority": "OGC", + // "code": "CRS84" + // } + // } + // + optional string crs = 3; + // Encoding used in the above crs field. It MUST be set if crs field is set. + // Currently the only allowed value is "PROJJSON". + optional string crs_encoding = 4; + + // Additional informative metadata about the geometry type. + // Recommended to write a JSON-encoded UTF-8 string. + optional string metadata = 5; +} + message Type { enum Kind { BOOLEAN = 0; @@ -216,6 +355,7 @@ message Type { VARCHAR = 16; CHAR = 17; TIMESTAMP_INSTANT = 18; + GEOMETRY = 19; } optional Kind kind = 1; repeated uint32 subtypes = 2 [packed=true]; @@ -224,6 +364,7 @@ message Type { optional uint32 precision = 5; optional uint32 scale = 6; repeated StringPair attributes = 7; + optional GeometryType geometry = 8; } message StripeInformation {