Improve ArrowWriter memory usage: Buffer Pages in ArrowWriter instead of RecordBatch (#3871) #4280

Merged: 6 commits, May 29, 2023
50 changes: 48 additions & 2 deletions parquet/src/arrow/arrow_writer/mod.rs
@@ -147,13 +147,13 @@ impl<W: Write + Send> ArrowWriter<W> {
         self.writer.flushed_row_groups()
     }

-    /// Returns the length in bytes of the current in progress row group
+    /// Returns the estimated length in bytes of the current in progress row group
     pub fn in_progress_size(&self) -> usize {
(alamb marked this conversation as resolved)
alamb (Contributor):
@tustvold -- I am having some trouble with this API when using it in IOx -- it is always returning zero (see https://github.com/influxdata/influxdb_iox/pull/7880/files#r1207315133)

I wrote a test for these APIs targeting this PR in tustvold#63; perhaps you can have a look and tell me if I am doing something wrong?

tustvold (Contributor Author):

Aah, this is an oversight on my part. The reported size will just be the size of the flushed pages; any data buffered but not yet flushed to a page will not be counted. This should be at most one page per column, and so should be less than the max page size (1MB) per column (so, for example, a 100-column file would under-report by at most ~100MB).

alamb (Contributor):

Can we make this API return the upper bound? Otherwise I am not sure how useful it is 🤔

alamb (Contributor):

To be clear here, the use case is: "I want to ensure that when writing many parquet files concurrently we don't exceed some memory limit (so that the process doing so isn't killed by k8s / the operating system)."

This doesn't need to be super accurate, just enough of an upper bound to achieve the above goal.

If it would be ok to add a 1MB overhead for each column (e.g. the PAGE_SIZE, or wherever that buffer is defined), I can try and propose a patch to do so.
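
A minimal sketch of that pattern, for illustration only (the 16 MiB budget, the batches iterator, and the Result-returning context are assumptions, not part of this PR):

// Flush the in-progress row group whenever the estimated buffered size
// crosses a budget, bounding peak memory for this writer.
const BUDGET_BYTES: usize = 16 * 1024 * 1024; // illustrative limit

for batch in batches {
    writer.write(&batch)?;
    if writer.in_progress_size() > BUDGET_BYTES {
        // Closing out the current row group frees its buffered pages
        writer.flush()?;
    }
}
writer.close()?;

The same check extends to many concurrent writers by summing their in_progress_size() values against a shared limit.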

tustvold (Contributor Author), May 27, 2023:

I need to think about how best to support an upper bound; we don't currently expose this information from the encoders. I had assumed, perhaps incorrectly, that an order-of-magnitude value was sufficient. Ultimately, if you're at a point where an individual page matters, you're going to have problems...

If the motivation is testing, you could always artificially lower the maximum page size.
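
For example, a test could shrink the page size so data is flushed to pages after only a few values, making in_progress_size() move even on tiny batches. A sketch, assuming the WriterProperties builder's page-size knob (the exact method name may vary across parquet versions):

// Force tiny pages so buffered data flushes to pages almost immediately
let props = WriterProperties::builder()
    .set_data_page_size_limit(128) // bytes; artificially small, for testing only
    .build();
let mut writer = ArrowWriter::try_new(vec![], batch.schema(), Some(props)).unwrap();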

tustvold (Contributor Author):

I've included tustvold#63 in 4683a30, along with a more accurate form of memory estimation. PTAL.

         match &self.in_progress {
             Some(in_progress) => in_progress
                 .writers
                 .iter()
-                .map(|(x, _)| x.lock().unwrap().length)
+                .map(|(_, x)| x.get_estimated_total_bytes() as usize)
                 .sum(),
             None => 0,
         }
@@ -354,6 +354,16 @@ enum ArrowColumnWriter {
     Column(ColumnWriter<'static>),
 }

+impl ArrowColumnWriter {
+    /// Returns the estimated total bytes for this column writer
+    fn get_estimated_total_bytes(&self) -> u64 {
+        match self {
+            ArrowColumnWriter::ByteArray(c) => c.get_estimated_total_bytes(),
+            ArrowColumnWriter::Column(c) => c.get_estimated_total_bytes(),
+        }
+    }
+}
+
 /// Encodes [`RecordBatch`] to a parquet row group
 struct ArrowRowGroupWriter {
     writers: Vec<(SharedColumnChunk, ArrowColumnWriter)>,
@@ -2531,4 +2541,40 @@ mod tests {
         assert_ne!(back.schema(), batch.schema());
         assert_eq!(back.column(0).as_ref(), batch.column(0).as_ref());
     }
+
+    #[test]
+    fn in_progress_accounting() {
+        // define schema
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+
+        // create some data
+        let a = Int32Array::from(vec![1, 2, 3, 4, 5]);
+
+        // build a record batch
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap();
+
+        let mut writer = ArrowWriter::try_new(vec![], batch.schema(), None).unwrap();
+
+        // starts empty
+        assert_eq!(writer.in_progress_size(), 0);
+        assert_eq!(writer.in_progress_rows(), 0);
+        writer.write(&batch).unwrap();
+
+        // updated on write
+        let initial_size = writer.in_progress_size();
+        assert!(initial_size > 0);
+        assert_eq!(writer.in_progress_rows(), 5);
+
+        // updated on second write
+        writer.write(&batch).unwrap();
+        assert!(writer.in_progress_size() > initial_size);
+        assert_eq!(writer.in_progress_rows(), 10);
+
+        // cleared on flush
+        writer.flush().unwrap();
+        assert_eq!(writer.in_progress_size(), 0);
+        assert_eq!(writer.in_progress_rows(), 0);
+
+        writer.close().unwrap();
+    }
 }
41 changes: 31 additions & 10 deletions parquet/src/column/writer/mod.rs
@@ -43,6 +43,21 @@ use crate::util::memory::ByteBufferPtr;

 pub(crate) mod encoder;

+macro_rules! downcast_writer {
+    ($e:expr, $i:ident, $b:expr) => {
+        match $e {
+            Self::BoolColumnWriter($i) => $b,
+            Self::Int32ColumnWriter($i) => $b,
+            Self::Int64ColumnWriter($i) => $b,
+            Self::Int96ColumnWriter($i) => $b,
+            Self::FloatColumnWriter($i) => $b,
+            Self::DoubleColumnWriter($i) => $b,
+            Self::ByteArrayColumnWriter($i) => $b,
+            Self::FixedLenByteArrayColumnWriter($i) => $b,
+        }
+    };
+}

 /// Column writer for a Parquet type.
 pub enum ColumnWriter<'a> {
     BoolColumnWriter(ColumnWriterImpl<'a, BoolType>),
@@ -56,18 +71,14 @@ pub enum ColumnWriter<'a> {
 }

 impl<'a> ColumnWriter<'a> {
+    /// Returns the estimated total bytes for this column writer
+    pub(crate) fn get_estimated_total_bytes(&self) -> u64 {
+        downcast_writer!(self, typed, typed.get_estimated_total_bytes())
+    }
+
     /// Close this [`ColumnWriter`]
     pub fn close(self) -> Result<ColumnCloseResult> {
-        match self {
-            Self::BoolColumnWriter(typed) => typed.close(),
-            Self::Int32ColumnWriter(typed) => typed.close(),
-            Self::Int64ColumnWriter(typed) => typed.close(),
-            Self::Int96ColumnWriter(typed) => typed.close(),
-            Self::FloatColumnWriter(typed) => typed.close(),
-            Self::DoubleColumnWriter(typed) => typed.close(),
-            Self::ByteArrayColumnWriter(typed) => typed.close(),
-            Self::FixedLenByteArrayColumnWriter(typed) => typed.close(),
-        }
+        downcast_writer!(self, typed, typed.close())
     }
 }
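
For illustration, the macro expansion is purely mechanical: downcast_writer!(self, typed, typed.get_estimated_total_bytes()) expands to the same shape as the match deleted from close() above, i.e.

match self {
    Self::BoolColumnWriter(typed) => typed.get_estimated_total_bytes(),
    Self::Int32ColumnWriter(typed) => typed.get_estimated_total_bytes(),
    // ...the remaining six variants dispatch identically
}

so each new variant-agnostic method no longer repeats the eight-arm match by hand.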

@@ -441,6 +452,16 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
         self.column_metrics.total_bytes_written
     }

+    /// Returns the estimated total bytes for this column writer
+    ///
+    /// Unlike [`Self::get_total_bytes_written`], this includes an estimate
+    /// of any data that has not yet been flushed to a page
+    pub(crate) fn get_estimated_total_bytes(&self) -> u64 {
tustvold (Contributor Author), May 27, 2023:

I've left this pub(crate) as I'm not sure about the returned value; it is potentially misleading, as it doesn't include the impact of any block compression on the size of the final output.

+        self.column_metrics.total_bytes_written
+            + self.encoder.estimated_data_page_size() as u64
tustvold (Contributor Author), May 27, 2023:

As the encoders all buffer encoded data internally, this is a fairly accurate estimate of the memory usage.

Edit: it will still be an underestimate, as the allocations may be larger than needed, but it is fairly accurate and doesn't require any breaking changes.

+            + self.encoder.estimated_dict_page_size().unwrap_or_default() as u64
+    }

     /// Returns total number of rows written by this column writer so far.
     /// This value is also returned when column writer is closed.
     pub fn get_total_rows_written(&self) -> u64 {