From 22930e8ee0b0ef0aa6ea655ef9ab559dbdc75d1f Mon Sep 17 00:00:00 2001 From: David Sisson Date: Mon, 29 Apr 2024 18:21:25 -0700 Subject: [PATCH] Updated documentation to be more clear, removed the mark reference as the act of marking is internal to the join. --- proto/substrait/algebra.proto | 9 ++++----- site/docs/relations/physical_relations.md | 3 +-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/proto/substrait/algebra.proto b/proto/substrait/algebra.proto index fb6db0905..cdeb0c926 100644 --- a/proto/substrait/algebra.proto +++ b/proto/substrait/algebra.proto @@ -732,8 +732,10 @@ message NestedLoopJoinRel { substrait.extensions.AdvancedExtension advanced_extension = 10; } -// A mark join utilizes a previously applied mark to greatly reduce the -// input to be processed. A mark is a nullable boolean field. +// A mark join internally scans the left side, constructing a hash table that +// is used to mark the right side as having a join partner on the left side. A +// mark is a nullable boolean field. The mark join operator is used to +// implement semi-joins, anti-joins, and other join types that are not equijoins. message MarkJoinRel { RelCommon common = 1; Rel left = 2; @@ -741,9 +743,6 @@ message MarkJoinRel { // optional, defaults to true (a cartesian join) Expression expression = 4; - // A reference to the mark field. - Expression.FieldReference mark_field = 6; - substrait.extensions.AdvancedExtension advanced_extension = 10; } diff --git a/site/docs/relations/physical_relations.md b/site/docs/relations/physical_relations.md index 8aeeeb4b6..e06af3bd6 100644 --- a/site/docs/relations/physical_relations.md +++ b/site/docs/relations/physical_relations.md @@ -75,7 +75,7 @@ The merge equijoin does a join by taking advantage of two sets that are sorted o ## Mark Join Operator -A mark join utilizes a previously applied mark to greatly reduce the input to be processed. 
+A mark join internally scans the left side, constructing a hash table that is used to mark the right side as having a join partner on the left side. This mark can end up being True, False, or NULL. True indicates that the right side has a join partner on the left side, False indicates that it does not, and NULL indicates that this could not be determined because the join expression evaluated to NULL. The mark join operator is used to implement semi-joins, anti-joins, and other join types that are not equijoins. | Signature | Value | | -------------------- | ------------------------------------------------------------ | @@ -90,7 +90,6 @@ A mark join utilizes a previously applied mark to greatly reduce the input to be |-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------| | Left Input | A relational input. | Required | | Right Input | A relational input. | Required | -| Mark Reference | A nullable boolean field reference that is used to filter the right input. If the mark is null, the row is not included in the join. | Required. | | Join Expression | A boolean condition that describes whether each record from the left set "match" the record from the right set. Field references correspond to the direct output order of the data. | Required. Can be (but not expected to be) the literal True. |