
Spark 3.3: Support storage-partitioned joins #6371

Merged: 3 commits, Dec 24, 2022
Changes from 2 commits
4 changes: 4 additions & 0 deletions api/src/main/java/org/apache/iceberg/types/Types.java
@@ -554,6 +554,10 @@ public List<NestedField> fields() {
return lazyFieldList();
}

public boolean containsField(int id) {
return lazyFieldsById().containsKey(id);
}

public NestedField field(String name) {
return lazyFieldsByName().get(name);
}
27 changes: 21 additions & 6 deletions core/src/main/java/org/apache/iceberg/Partitioning.java
@@ -200,6 +200,16 @@ public Void alwaysNull(int fieldId, String sourceName, int sourceId) {
/**
* Builds a grouping key type considering all provided specs.
*
* @param specs one or many specs
* @return the constructed grouping key type
*/
public static StructType groupingKeyType(Collection<PartitionSpec> specs) {
return groupingKeyType(null, specs);
}

/**
* Builds a grouping key type considering the provided schema and specs.
*
* <p>A grouping key defines how data is split between files and consists of partition fields with
* non-void transforms that are present in each provided spec. Iceberg guarantees that records
* with different values for the grouping key are disjoint and are stored in separate files.
@@ -215,11 +225,15 @@ public Void alwaysNull(int fieldId, String sourceName, int sourceId) {
* that have the same field ID but use a void transform under the hood. Such fields cannot be part
* of the grouping key as void transforms always return null.
*
* <p>If the provided schema is not null, this method will only take into account partition fields
* on top of columns present in the schema. Otherwise, all partition fields will be considered.
*
* @param schema a schema specifying a set of source columns to consider (null to consider all)
* @param specs one or many specs
* @return the constructed grouping key type
*/
public static StructType groupingKeyType(Collection<PartitionSpec> specs) {
return buildPartitionProjectionType("grouping key", specs, commonActiveFieldIds(specs));
public static StructType groupingKeyType(Schema schema, Collection<PartitionSpec> specs) {
Review comment (author): We need to take into account the schema as we may project only particular columns.

return buildPartitionProjectionType("grouping key", specs, commonActiveFieldIds(schema, specs));
Review comment (reviewer): How do we limit the specs passed to this method to just the ones that are used by manifests that are scanned during planning? (I may answer this myself later, but I want to write the question down.)

Reply (author): It is determined based on scan tasks that match our filter in SparkPartitioningAwareScan.

Reply (author): If a table has multiple specs but we scan only tasks that belong to one spec, we should take into account only the one that is being queried.

}

/**
@@ -341,15 +355,15 @@ private static Set<Integer> allFieldIds(Collection<PartitionSpec> specs) {
}

// collects IDs of partition fields with non-void transforms that are present in each spec
private static Set<Integer> commonActiveFieldIds(Collection<PartitionSpec> specs) {
private static Set<Integer> commonActiveFieldIds(Schema schema, Collection<PartitionSpec> specs) {
Set<Integer> commonActiveFieldIds = Sets.newHashSet();

int specIndex = 0;
for (PartitionSpec spec : specs) {
if (specIndex == 0) {
commonActiveFieldIds.addAll(activeFieldIds(spec));
commonActiveFieldIds.addAll(activeFieldIds(schema, spec));
} else {
commonActiveFieldIds.retainAll(activeFieldIds(spec));
commonActiveFieldIds.retainAll(activeFieldIds(schema, spec));
}

specIndex++;
@@ -358,8 +372,9 @@ private static Set<Integer> commonActiveFieldIds(Collection<PartitionSpec> specs
return commonActiveFieldIds;
}

private static List<Integer> activeFieldIds(PartitionSpec spec) {
private static List<Integer> activeFieldIds(Schema schema, PartitionSpec spec) {
return spec.fields().stream()
.filter(field -> schema == null || schema.findField(field.sourceId()) != null)
.filter(field -> !isVoidTransform(field))
.map(PartitionField::fieldId)
.collect(Collectors.toList());
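To make the earlier discussion concrete, here is a minimal sketch of how the new overload can be combined with the spec-collection step described above: only the partition specs of the tasks that matched the scan filter are considered, and only partition fields on top of projected columns end up in the grouping key type. The helper class and method below are illustrative glue code, not the actual SparkPartitioningAwareScan implementation.

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Partitioning;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.types.Types.StructType;

class GroupingKeyTypeSketch {

  // Illustrative helper: derive the grouping key type for a scan from the specs of the
  // tasks that survived filtering and from the columns that were actually projected.
  static StructType groupingKeyType(Table table, List<String> projectedColumns, List<FileScanTask> tasks) {
    Schema projectedSchema = table.schema().select(projectedColumns);
    Set<PartitionSpec> specs = tasks.stream().map(FileScanTask::spec).collect(Collectors.toSet());
    return Partitioning.groupingKeyType(projectedSchema, specs);
  }
}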
13 changes: 13 additions & 0 deletions core/src/test/java/org/apache/iceberg/TestPartitioning.java
@@ -354,6 +354,19 @@ public void testGroupingKeyTypeWithEvolvedUnpartitionedSpec() {
Assert.assertEquals("Types must match", expectedType, actualType);
}

@Test
public void testGroupingKeyTypeWithProjectedSchema() {
TestTables.TestTable table =
TestTables.create(tableDir, "test", SCHEMA, BY_CATEGORY_DATA_SPEC, V1_FORMAT_VERSION);

Schema projectedSchema = table.schema().select("id", "data");
Review comment (reviewer): Ah, so is the idea to use the pushed projection to limit the fields used? I think that's a great idea!

Reply (author): Correct. Otherwise, we would break Spark because we could report non-projected columns.


StructType expectedType =
StructType.of(NestedField.optional(1001, "data", Types.StringType.get()));
StructType actualType = Partitioning.groupingKeyType(projectedSchema, table.specs().values());
Assert.assertEquals("Types must match", expectedType, actualType);
}

@Test
public void testGroupingKeyTypeWithIncompatibleSpecEvolution() {
TestTables.TestTable table =
(next file; path not shown)
@@ -404,15 +404,15 @@ public void testSparkTableAddDropPartitions() throws Exception {
assertPartitioningEquals(sparkTable(), 1, "bucket(16, id)");

sql("ALTER TABLE %s ADD PARTITION FIELD truncate(data, 4)", tableName);
assertPartitioningEquals(sparkTable(), 2, "truncate(data, 4)");
assertPartitioningEquals(sparkTable(), 2, "truncate(4, data)");
Review comment (aokolnychyi, Dec 7, 2022): I had to change Spark3Util.toTransforms to match TruncateFunction, which expects width first.


sql("ALTER TABLE %s ADD PARTITION FIELD years(ts)", tableName);
assertPartitioningEquals(sparkTable(), 3, "years(ts)");

sql("ALTER TABLE %s DROP PARTITION FIELD years(ts)", tableName);
assertPartitioningEquals(sparkTable(), 2, "truncate(data, 4)");
assertPartitioningEquals(sparkTable(), 2, "truncate(4, data)");

sql("ALTER TABLE %s DROP PARTITION FIELD truncate(data, 4)", tableName);
sql("ALTER TABLE %s DROP PARTITION FIELD truncate(4, data)", tableName);
assertPartitioningEquals(sparkTable(), 1, "bucket(16, id)");

sql("ALTER TABLE %s DROP PARTITION FIELD shard", tableName);
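For reference, the assertion change above reflects that the reported truncate transform now renders its width argument before the column, matching Spark's TruncateFunction. A rough sketch using Spark's connector expression factories (Expressions.apply, literal, and column exist in Spark 3.3; the exact describe() output is an assumption):

import org.apache.spark.sql.connector.expressions.Expressions;
import org.apache.spark.sql.connector.expressions.Transform;

class TruncateTransformSketch {
  public static void main(String[] args) {
    // Width first, matching Spark's TruncateFunction argument order.
    Transform truncate =
        Expressions.apply("truncate", Expressions.literal(4), Expressions.column("data"));
    // Expected to describe itself as "truncate(4, data)" rather than "truncate(data, 4)".
    System.out.println(truncate.describe());
  }
}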
(next file; path not shown)
@@ -32,14 +32,19 @@
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.RowLevelOperationMode;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.util.concurrent.MoreExecutors;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkSQLProperties;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
import org.apache.spark.sql.internal.SQLConf;
import org.assertj.core.api.Assertions;
import org.junit.Assert;
import org.junit.Assume;
@@ -143,4 +148,32 @@ public synchronized void testDeleteWithConcurrentTableRefresh() throws Exception
executorService.shutdown();
Assert.assertTrue("Timeout", executorService.awaitTermination(2, TimeUnit.MINUTES));
}

@Test
public void testRuntimeFilteringWithReportedPartitioning() throws NoSuchTableException {
createAndInitPartitionedTable();

append(new Employee(1, "hr"), new Employee(3, "hr"));
append(new Employee(1, "hardware"), new Employee(2, "hardware"));

Map<String, String> sqlConf =
ImmutableMap.of(
SQLConf.V2_BUCKETING_ENABLED().key(),
"true",
SparkSQLProperties.PRESERVE_DATA_GROUPING,
"true");

withSQLConf(sqlConf, () -> sql("DELETE FROM %s WHERE id = 2", tableName));

Table table = validationCatalog.loadTable(tableIdent);
Assert.assertEquals("Should have 3 snapshots", 3, Iterables.size(table.snapshots()));

Snapshot currentSnapshot = table.currentSnapshot();
validateCopyOnWrite(currentSnapshot, "1", "1", "1");
Review comment (rdblue, Dec 20, 2022): Nit: it would be nice to not pass the counts as strings. Not that we should fix it in this PR, but maybe we can introduce a version of validateCopyOnWrite that accepts ints and calls the string version.

Reply (aokolnychyi, Dec 20, 2022): This is what we did historically to simplify the validation, as we compare the summary map (i.e., strings). I'll check.

Reply (aokolnychyi): Doing this properly would touch lots of existing places. I'll follow up to fix this separately.


assertEquals(
"Should have expected rows",
ImmutableList.of(row(1, "hardware"), row(1, "hr"), row(3, "hr")),
sql("SELECT * FROM %s ORDER BY id, dep", tableName));
}
}
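As a possible follow-up to the review thread above (explicitly not part of this PR), an int-accepting overload could simply delegate to the existing string-based helper. A sketch, assuming the helper takes the snapshot plus three counts passed positionally as strings; the parameter names below are guesses based on the usage in these tests.

// Hypothetical convenience overload for the test base class; converts int counts to the
// strings expected by the existing validateCopyOnWrite(Snapshot, String, String, String).
protected void validateCopyOnWrite(
    Snapshot snapshot, int changedPartitionCount, int deletedDataFileCount, int addedDataFileCount) {
  validateCopyOnWrite(
      snapshot,
      String.valueOf(changedPartitionCount),
      String.valueOf(deletedDataFileCount),
      String.valueOf(addedDataFileCount));
}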
(next file; path not shown)
@@ -32,14 +32,18 @@
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.RowLevelOperationMode;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.util.concurrent.MoreExecutors;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkSQLProperties;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.internal.SQLConf;
import org.assertj.core.api.Assertions;
import org.junit.Assert;
import org.junit.Assume;
@@ -148,4 +152,45 @@ public synchronized void testMergeWithConcurrentTableRefresh() throws Exception
executorService.shutdown();
Assert.assertTrue("Timeout", executorService.awaitTermination(2, TimeUnit.MINUTES));
}

@Test
public void testRuntimeFilteringWithReportedPartitioning() {
createAndInitTable("id INT, dep STRING");
sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName);

append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }");
append(
tableName,
"{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }");

createOrReplaceView("source", Collections.singletonList(2), Encoders.INT());

Map<String, String> sqlConf =
ImmutableMap.of(
SQLConf.V2_BUCKETING_ENABLED().key(),
"true",
SparkSQLProperties.PRESERVE_DATA_GROUPING,
"true");

withSQLConf(
sqlConf,
() ->
sql(
"MERGE INTO %s t USING source s "
+ "ON t.id == s.value "
+ "WHEN MATCHED THEN "
+ " UPDATE SET id = -1",
tableName));

Table table = validationCatalog.loadTable(tableIdent);
Assert.assertEquals("Should have 3 snapshots", 3, Iterables.size(table.snapshots()));

Snapshot currentSnapshot = table.currentSnapshot();
validateCopyOnWrite(currentSnapshot, "1", "1", "1");

assertEquals(
"Should have expected rows",
ImmutableList.of(row(-1, "hardware"), row(1, "hardware"), row(1, "hr"), row(3, "hr")),
sql("SELECT * FROM %s ORDER BY id, dep", tableName));
}
}
(next file; path not shown)
@@ -31,13 +31,17 @@
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.RowLevelOperationMode;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
import org.apache.iceberg.relocated.com.google.common.util.concurrent.MoreExecutors;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkSQLProperties;
import org.apache.spark.sql.internal.SQLConf;
import org.assertj.core.api.Assertions;
import org.junit.Assert;
import org.junit.Assume;
@@ -140,4 +144,35 @@ public synchronized void testUpdateWithConcurrentTableRefresh() throws Exception
executorService.shutdown();
Assert.assertTrue("Timeout", executorService.awaitTermination(2, TimeUnit.MINUTES));
}

@Test
public void testRuntimeFilteringWithReportedPartitioning() {
createAndInitTable("id INT, dep STRING");
sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName);

append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }");
append(
tableName,
"{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }");

Map<String, String> sqlConf =
ImmutableMap.of(
SQLConf.V2_BUCKETING_ENABLED().key(),
"true",
SparkSQLProperties.PRESERVE_DATA_GROUPING,
"true");

withSQLConf(sqlConf, () -> sql("UPDATE %s SET id = cast('-1' AS INT) WHERE id = 2", tableName));

Table table = validationCatalog.loadTable(tableIdent);
Assert.assertEquals("Should have 3 snapshots", 3, Iterables.size(table.snapshots()));

Snapshot currentSnapshot = table.currentSnapshot();
validateCopyOnWrite(currentSnapshot, "1", "1", "1");

assertEquals(
"Should have expected rows",
ImmutableList.of(row(-1, "hardware"), row(1, "hardware"), row(1, "hr"), row(3, "hr")),
sql("SELECT * FROM %s ORDER BY id, dep", tableName));
}
}
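For context, the new tests turn on storage-partitioned joins through two settings: Spark's V2 bucketing flag and Iceberg's data-grouping preservation property. A minimal sketch of enabling them on a session follows; the literal property strings are assumed values of the SQLConf.V2_BUCKETING_ENABLED and SparkSQLProperties.PRESERVE_DATA_GROUPING constants referenced in the tests.

import org.apache.spark.sql.SparkSession;

class StoragePartitionedJoinSetup {
  public static void main(String[] args) {
    // Sketch only: the property keys below are assumed string values of the constants used in the tests.
    SparkSession spark =
        SparkSession.builder()
            .master("local[*]")
            .config("spark.sql.sources.v2.bucketing.enabled", "true") // SQLConf.V2_BUCKETING_ENABLED
            .config("spark.sql.iceberg.planning.preserve-data-grouping", "true") // SparkSQLProperties.PRESERVE_DATA_GROUPING
            .getOrCreate();

    // With both settings enabled and compatible partitioning reported on both sides,
    // Spark can plan joins (and runtime filtering for row-level operations) without a shuffle.
    spark.stop();
  }
}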