apache · tustvold · Jul 26, 2022 · Jul 12, 2022 · Jul 12, 2022 · Jul 12, 2022
diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs
@@ -37,11 +37,22 @@ fn create_binary_array_data(length: i32) -> ArrayData {
     .unwrap()
 }
 
-fn array_slice_benchmark(c: &mut Criterion) {
+fn validate_utf8_array(arr: &StringArray) {
+    arr.data().validate_values().unwrap(); // unwrap panics if validation does not succeed
+}
+
+fn validate_benchmark(c: &mut Criterion) {
+    // Binary array: every iteration calls create_binary_array_data(20000).
     c.bench_function("validate_binary_array_data 20000", |b| {
         b.iter(|| create_binary_array_data(20000))
     });
+
+    // Utf8 array: built once outside the `b.iter` closure; each iteration only validates it.
+    let str_arr = StringArray::from(vec!["test"; 20000]);
+    c.bench_function("validate_utf8_array_data 20000", |b| {
+        b.iter(|| validate_utf8_array(&str_arr))
+    });
 }
 
-criterion_group!(benches, array_slice_benchmark);
+criterion_group!(benches, validate_benchmark);
 criterion_main!(benches);
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
@@ -1125,16 +1125,37 @@ impl ArrayData {
         T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
     {
         let values_buffer = &self.buffers[1].as_slice();
-
-        self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
-            std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
-                ArrowError::InvalidArgumentError(format!(
-                    "Invalid UTF8 sequence at string index {} ({:?}): {}",
-                    string_index, range, e
-                ))
-            })?;
-            Ok(())
-        })
+        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
+            // Validate Offsets are correct
+            self.validate_each_offset::<T, _>(
+                values_buffer.len(),
+                |string_index, range| {
+                    if !values_str.is_char_boundary(range.start)
+                        || !values_str.is_char_boundary(range.end)
+                    {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "incomplete utf-8 byte sequence from index {}",
+                            string_index
+                        )));
+                    }
+                    Ok(())
+                },
+            )
+        } else {
+            // find specific offset that failed utf8 validation
+            self.validate_each_offset::<T, _>(
+                values_buffer.len(),
+                |string_index, range| {
+                    std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
+                        ArrowError::InvalidArgumentError(format!(
+                            "Invalid UTF8 sequence at string index {} ({:?}): {}",
+                            string_index, range, e
+                        ))
+                    })?;
+                    Ok(())
+                },
+            )
+        }
     }
 
     /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are