Support Partitioning Data by Dictionary Encoded String Array Types #7896

Merged
merged 7 commits into from
Oct 28, 2023
Changes from 2 commits
9 changes: 9 additions & 0 deletions datafusion/common/src/dfschema.rs
@@ -420,6 +420,15 @@ impl DFSchema {
                Self::datatype_is_semantically_equal(k1.as_ref(), k2.as_ref())
                    && Self::datatype_is_semantically_equal(v1.as_ref(), v2.as_ref())
            }
            // The next two cases allow for the possibility that one schema has a dictionary encoded array
Contributor

I am somewhat worried about this change (as it effectively changes the meaning of the function to also include logical types).

I think we should either update the comments to reflect this change, or (my preference) make a new function that is explicit. Perhaps something like:

    /// Returns true if two [`DataType`]s have the same logical data type:
    /// either the same name and type, or, if one is dictionary encoded,
    /// the same name and type of the values.
    /// E.g. Dictionary(_, Utf8) is semantically equivalent to Utf8 since both represent an array of strings
    fn datatype_has_same_logical_type(dt1: &DataType, dt2: &DataType) -> bool {
        match (dt1, dt2) {
            (DataType::Dictionary(_, v1), othertype) => v1.as_ref() == othertype,
            (othertype, DataType::Dictionary(_, v1)) => v1.as_ref() == othertype,
            _ => Self::datatype_is_semantically_equal(dt1, dt2),
        }
    }

Contributor Author

I think your concern is justified, as the optimizer also relies on this function and may have stricter equivalence requirements. I created a separate method for logical equivalence that allows for different dictionary encodings as long as the values ultimately resolve to the same type.

The optimizer continues to use the original semantic equivalence method, and I've updated the insert_into methods to use the more lenient logical equivalence check.
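
For illustration, here is a minimal free-standing sketch of that logical-equivalence idea. The function name, the recursive unwrapping, and the plain == fallback (standing in for datatype_is_semantically_equal) are assumptions for this sketch, not necessarily the exact code merged in this PR:

    use arrow_schema::DataType;

    /// Sketch: two types are "logically" equal if they compare equal after
    /// unwrapping any dictionary encoding on either side.
    fn datatype_is_logically_equal(dt1: &DataType, dt2: &DataType) -> bool {
        match (dt1, dt2) {
            // E.g. Dictionary(Int32, Utf8) is logically equal to Utf8
            (DataType::Dictionary(_, v1), other) | (other, DataType::Dictionary(_, v1)) => {
                datatype_is_logically_equal(v1.as_ref(), other)
            }
            // Stand-in for the stricter DFSchema::datatype_is_semantically_equal check
            _ => dt1 == dt2,
        }
    }

    fn main() {
        let dict = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
        assert!(datatype_is_logically_equal(&dict, &DataType::Utf8));
        assert!(!datatype_is_logically_equal(&dict, &DataType::Int64));
    }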

            // and the other has an equivalent non dictionary encoded array of the same type
            // E.g. Dictionary(_, Utf8) is semantically equivalent to Utf8 since both represent an array of strings
            (DataType::Dictionary(_, v1), othertype) => {
                v1.as_ref() == othertype
            }
            (othertype, DataType::Dictionary(_, v1)) => {
                v1.as_ref() == othertype
            }
            (DataType::List(f1), DataType::List(f2))
            | (DataType::LargeList(f1), DataType::LargeList(f2))
            | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _))
37 changes: 34 additions & 3 deletions datafusion/core/src/datasource/file_format/write/demux.rs
@@ -28,8 +28,9 @@ use crate::error::Result;
use crate::physical_plan::SendableRecordBatchStream;

use arrow_array::builder::UInt64Builder;
use arrow_array::cast::AsArray;
use arrow_array::{RecordBatch, StructArray};
use arrow_array::cast::{as_dictionary_array, AsArray};
use arrow_array::types::{Int32Type, UInt16Type};
use arrow_array::{RecordBatch, StringArray, StructArray};
use arrow_schema::{DataType, Schema};
use datafusion_common::cast::as_string_array;
use datafusion_common::DataFusionError;
@@ -310,7 +311,37 @@ fn compute_partition_keys_by_row<'a>(
            for i in 0..rb.num_rows() {
                partition_values.push(array.value(i));
            }
        }
        },
        DataType::Dictionary(key_type, _) => {
            match **key_type {
                DataType::UInt16 => {
Contributor Author
@devinjdangelo Oct 21, 2023

It would be nice not to have to enumerate every possible key_type here. Ideally there would be a way to cast any Dictionary(_, V) -> V, or at least to iterate over any Dictionary(_, V) and pull out the V values in order without caring what type the key is.

Contributor

Hey Devin, there is a downcast_dictionary_array! macro that I encountered while attempting to fix this issue. (I was going to submit a PR but was figuring out if/how to add tests!) It lets you write code once that works for all key types, if I am not mistaken. I saw it used elsewhere in DataFusion code: https://github.com/apache/arrow-datafusion/blob/9fde5c4282fd9f0e3332fb40998bf1562c17fcda/datafusion/common/src/hash_utils.rs#L326-L329

You can see the code I used here:
https://github.com/polygon-io/arrow-datafusion/blob/8c955891a6e5b6002faa07e54a67a6ec93347ade/datafusion/core/src/datasource/file_format/write/demux.rs#L316-L328

            DataType::Dictionary(_, value_type)
                if value_type.as_ref() == &DataType::Utf8 =>
            {
                downcast_dictionary_array!(
                    col_array =>  {
                        let array = col_array.downcast_dict::<StringArray>().unwrap();
                        for i in 0..rb.num_rows() {
                            partition_values.push(array.value(i));
                        }
                    },
                    _ => unreachable!(),
                )
            }
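
To make the macro's behavior concrete, here is a standalone sketch (the array construction and the helper name are illustrative; in the PR the column comes from a RecordBatch) showing that one code path handles any dictionary key type:

    use std::sync::Arc;

    use arrow_array::types::{Int8Type, UInt16Type};
    use arrow_array::{downcast_dictionary_array, Array, ArrayRef, DictionaryArray, StringArray};

    /// Collect the string values of a dictionary-encoded column without
    /// naming the key type.
    fn collect_partition_values(col_array: &dyn Array) -> Vec<String> {
        let mut partition_values = Vec::with_capacity(col_array.len());
        downcast_dictionary_array!(
            // col_array is rebound to the concrete DictionaryArray<K>, whatever K is
            col_array => {
                let array = col_array
                    .downcast_dict::<StringArray>()
                    .expect("expected string dictionary values");
                for val in array.into_iter() {
                    // null partition values are handled separately in the PR; default to "" here
                    partition_values.push(val.unwrap_or("").to_string());
                }
            },
            _ => unreachable!("expected a dictionary array"),
        );
        partition_values
    }

    fn main() {
        // Same logical values, two different key types
        let a: DictionaryArray<Int8Type> = vec!["x", "y", "x"].into_iter().collect();
        let b: DictionaryArray<UInt16Type> = vec!["x", "y", "x"].into_iter().collect();
        let a: ArrayRef = Arc::new(a);
        let b: ArrayRef = Arc::new(b);
        assert_eq!(collect_partition_values(a.as_ref()), vec!["x", "y", "x"]);
        assert_eq!(collect_partition_values(b.as_ref()), vec!["x", "y", "x"]);
    }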

Contributor Author

Thanks @suremarc for the pointer to that macro! I updated this PR and it seems to work as expected!

I also added a new test based on a dictionary-encoded parquet file from one of the testing submodules. It runs without error, but the values look a bit off; not sure at this point whether that's related to the downcasting or something else.

                    let dict_array = as_dictionary_array::<UInt16Type>(col_array);
                    let array = dict_array.downcast_dict::<StringArray>()
                        .ok_or(DataFusionError::NotImplemented(format!("It is not yet supported to write to hive partitioned with datatype {}", dtype)))?;
                    for val in array.into_iter() {
                        partition_values.push(
                            val.ok_or(DataFusionError::Execution("Partition values cannot be null!".into()))?
                        );
                    }
                },
                DataType::Int32 => {
                    let dict_array = as_dictionary_array::<Int32Type>(col_array);
                    let array = dict_array.downcast_dict::<StringArray>()
                        .ok_or(DataFusionError::NotImplemented(format!("It is not yet supported to write to hive partitioned with datatype {}", dtype)))?;
                    for val in array.into_iter() {
                        partition_values.push(
                            val.ok_or(DataFusionError::Execution("Partition values cannot be null!".into()))?
Contributor Author
@devinjdangelo Oct 21, 2023

Traditionally, null values are sent to __HIVE_DEFAULT_PARTITION__, but IMO it is also a reasonable choice to throw an error instead. If the user desires the traditional behavior, they can write COALESCE(part_col, '__HIVE_DEFAULT_PARTITION__') AS part_col.

https://cwiki.apache.org/confluence/display/Hive/Tutorial#Tutorial-Dynamic-PartitionInsert
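
As a small side-by-side illustration of that choice (the helper and its signature are hypothetical, not DataFusion APIs), null partition values could either error, as this PR does, or fall back to the traditional Hive bucket:

    // Hypothetical helper contrasting the two null-handling policies discussed above.
    const HIVE_DEFAULT_PARTITION: &str = "__HIVE_DEFAULT_PARTITION__";

    fn partition_key(value: Option<&str>, route_nulls_to_default: bool) -> Result<String, String> {
        match value {
            Some(v) => Ok(v.to_string()),
            // Traditional Hive behavior: nulls land in the default partition
            None if route_nulls_to_default => Ok(HIVE_DEFAULT_PARTITION.to_string()),
            // Behavior chosen in this PR: writing a null partition value is an error
            None => Err("Partition values cannot be null!".to_string()),
        }
    }

    fn main() {
        assert_eq!(partition_key(Some("2023-10-21"), false).unwrap(), "2023-10-21");
        assert_eq!(partition_key(None, true).unwrap(), HIVE_DEFAULT_PARTITION);
        assert!(partition_key(None, false).is_err());
    }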

                        );
                    }
                },
                _ => {
                    return Err(DataFusionError::NotImplemented(format!(
                        "It is not yet supported to write to hive partitions with datatype {}",
                        dtype
                    )))
                }
            }
        },
        _ => {
            return Err(DataFusionError::NotImplemented(format!(
                "it is not yet supported to write to hive partitions with datatype {}",