From 4e08f2f85d8a63f6565c1a84b85a3b25bcad5588 Mon Sep 17 00:00:00 2001 From: Lukasz Anforowicz Date: Mon, 29 Jan 2024 19:07:34 +0000 Subject: [PATCH 1/6] Extract a separate `palette.rs` module. This commit moves `expand_paletted_into_rgb8` and `expand_paletted_into_rgba8` (and their unit tests) into a separate `transform/palette.rs` module. This prepares room for encapsulating extra complexity in this module in follow-up commits, where we will start to precompute and memoize some data when creating a `TransformFn`. This commit just moves the code around - it should have no impact on correctness or performance. --- src/decoder/transform.rs | 205 ++----------------------------- src/decoder/transform/palette.rs | 195 +++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+), 196 deletions(-) create mode 100644 src/decoder/transform/palette.rs diff --git a/src/decoder/transform.rs b/src/decoder/transform.rs index 0cbc0740..cc89ac99 100644 --- a/src/decoder/transform.rs +++ b/src/decoder/transform.rs @@ -1,5 +1,7 @@ //! Transforming a decompressed, unfiltered row into the final output. +mod palette; + use crate::{BitDepth, ColorType, DecodingError, Info, Transformations}; use super::stream::FormatErrorInner; @@ -42,9 +44,9 @@ pub fn create_transform_fn( )); } else { if trns { - Ok(expand_paletted_into_rgba8) + Ok(palette::expand_paletted_into_rgba8) } else { - Ok(expand_paletted_into_rgb8) + Ok(palette::expand_paletted_into_rgb8) } } } @@ -132,7 +134,7 @@ where } } -pub fn expand_trns_line(input: &[u8], output: &mut [u8], info: &Info) { +fn expand_trns_line(input: &[u8], output: &mut [u8], info: &Info) { let channels = info.color_type.samples(); let trns = info.trns.as_deref(); for (input, output) in input @@ -144,7 +146,7 @@ pub fn expand_trns_line(input: &[u8], output: &mut [u8], info: &Info) { } } -pub fn expand_trns_line16(input: &[u8], output: &mut [u8], info: &Info) { +fn expand_trns_line16(input: &[u8], output: &mut [u8], info: &Info) { let channels = info.color_type.samples(); let trns = info.trns.as_deref(); for (input, output) in input @@ -162,7 +164,7 @@ pub fn expand_trns_line16(input: &[u8], output: &mut [u8], info: &Info) { } } -pub fn expand_trns_and_strip_line16(input: &[u8], output: &mut [u8], info: &Info) { +fn expand_trns_and_strip_line16(input: &[u8], output: &mut [u8], info: &Info) { let channels = info.color_type.samples(); let trns = info.trns.as_deref(); for (input, output) in input @@ -176,60 +178,14 @@ pub fn expand_trns_and_strip_line16(input: &[u8], output: &mut [u8], info: &Info } } -pub fn expand_paletted_into_rgb8(row: &[u8], buffer: &mut [u8], info: &Info) { - let palette = info.palette.as_deref().expect("Caller should verify"); - let black = [0, 0, 0]; - - unpack_bits(row, buffer, 3, info.bit_depth as u8, |i, chunk| { - let rgb = palette - .get(3 * i as usize..3 * i as usize + 3) - .unwrap_or(&black); - chunk[0] = rgb[0]; - chunk[1] = rgb[1]; - chunk[2] = rgb[2]; - }) -} - -pub fn expand_paletted_into_rgba8(row: &[u8], buffer: &mut [u8], info: &Info) { - let palette = info.palette.as_deref().expect("Caller should verify"); - let trns = info.trns.as_deref().unwrap_or(&[]); - let black = [0, 0, 0]; - - // > The tRNS chunk shall not contain more alpha values than there are palette - // entries, but a tRNS chunk may contain fewer values than there are palette - // entries. In this case, the alpha value for all remaining palette entries is - // assumed to be 255. - // - // It seems, accepted reading is to fully *ignore* an invalid tRNS as if it were - // completely empty / all pixels are non-transparent. - let trns = if trns.len() <= palette.len() / 3 { - trns - } else { - &[] - }; - - unpack_bits(row, buffer, 4, info.bit_depth as u8, |i, chunk| { - let (rgb, a) = ( - palette - .get(3 * i as usize..3 * i as usize + 3) - .unwrap_or(&black), - *trns.get(i as usize).unwrap_or(&0xFF), - ); - chunk[0] = rgb[0]; - chunk[1] = rgb[1]; - chunk[2] = rgb[2]; - chunk[3] = a; - }); -} - -pub fn expand_gray_u8(row: &[u8], buffer: &mut [u8], info: &Info) { +fn expand_gray_u8(row: &[u8], buffer: &mut [u8], info: &Info) { let scaling_factor = (255) / ((1u16 << info.bit_depth as u8) - 1) as u8; unpack_bits(row, buffer, 1, info.bit_depth as u8, |val, chunk| { chunk[0] = val * scaling_factor }); } -pub fn expand_gray_u8_with_trns(row: &[u8], buffer: &mut [u8], info: &Info) { +fn expand_gray_u8_with_trns(row: &[u8], buffer: &mut [u8], info: &Info) { let scaling_factor = (255) / ((1u16 << info.bit_depth as u8) - 1) as u8; let trns = info.trns.as_deref(); unpack_bits(row, buffer, 2, info.bit_depth as u8, |pixel, chunk| { @@ -245,146 +201,3 @@ pub fn expand_gray_u8_with_trns(row: &[u8], buffer: &mut [u8], info: &Info) { chunk[0] = pixel * scaling_factor }); } - -#[cfg(test)] -mod test { - use crate::{BitDepth, ColorType, Info, Transformations}; - - fn expand_paletted( - src: &[u8], - src_bit_depth: u8, - palette: &[u8], - trns: Option<&[u8]>, - ) -> Vec { - let info = Info { - color_type: ColorType::Indexed, - bit_depth: BitDepth::from_u8(src_bit_depth).unwrap(), - palette: Some(palette.into()), - trns: trns.map(Into::into), - ..Info::default() - }; - let output_bytes_per_input_sample = match trns { - None => 3, - Some(_) => 4, - }; - let samples_count_per_byte = (8 / src_bit_depth) as usize; - let samples_count = src.len() * samples_count_per_byte; - let mut dst = vec![0; samples_count * output_bytes_per_input_sample]; - let transform_fn = super::create_transform_fn(&info, Transformations::EXPAND).unwrap(); - transform_fn(src, dst.as_mut_slice(), &info); - dst - } - - #[test] - fn test_expand_paletted_rgba_8bit() { - let actual = expand_paletted( - &[0, 1, 2, 3], // src - 8, // src_bit_depth - &[ - // palette - 0, 1, 2, // entry #0 - 4, 5, 6, // entry #1 - 8, 9, 10, // entry #2 - 12, 13, 14, // entry #3 - ], - Some(&[3, 7, 11, 15]), // trns - ); - assert_eq!(actual, (0..16).collect::>()); - } - - #[test] - fn test_expand_paletted_rgb_8bit() { - let actual = expand_paletted( - &[0, 1, 2, 3], // src - 8, // src_bit_depth - &[ - // palette - 0, 1, 2, // entry #0 - 3, 4, 5, // entry #1 - 6, 7, 8, // entry #2 - 9, 10, 11, // entry #3 - ], - None, // trns - ); - assert_eq!(actual, (0..12).collect::>()); - } - - #[test] - fn test_expand_paletted_rgba_4bit() { - let actual = expand_paletted( - &[0x01, 0x23], // src - 4, // src_bit_depth - &[ - // palette - 0, 1, 2, // entry #0 - 4, 5, 6, // entry #1 - 8, 9, 10, // entry #2 - 12, 13, 14, // entry #3 - ], - Some(&[3, 7, 11, 15]), // trns - ); - assert_eq!(actual, (0..16).collect::>()); - } - - #[test] - fn test_expand_paletted_rgb_4bit() { - let actual = expand_paletted( - &[0x01, 0x23], // src - 4, // src_bit_depth - &[ - // palette - 0, 1, 2, // entry #0 - 3, 4, 5, // entry #1 - 6, 7, 8, // entry #2 - 9, 10, 11, // entry #3 - ], - None, // trns - ); - assert_eq!(actual, (0..12).collect::>()); - } - - #[test] - fn test_expand_paletted_rgba_8bit_more_trns_entries_than_palette_entries() { - let actual = expand_paletted( - &[0, 1, 2, 3], // src - 8, // src_bit_depth - &[ - // palette - 0, 1, 2, // entry #0 - 4, 5, 6, // entry #1 - 8, 9, 10, // entry #2 - 12, 13, 14, // entry #3 - ], - Some(&[123; 5]), // trns - ); - - // Invalid (too-long) `trns` means that we'll use 0xFF / opaque alpha everywhere. - assert_eq!( - actual, - vec![0, 1, 2, 0xFF, 4, 5, 6, 0xFF, 8, 9, 10, 0xFF, 12, 13, 14, 0xFF], - ); - } - - #[test] - fn test_expand_paletted_rgba_8bit_less_trns_entries_than_palette_entries() { - let actual = expand_paletted( - &[0, 1, 2, 3], // src - 8, // src_bit_depth - &[ - // palette - 0, 1, 2, // entry #0 - 4, 5, 6, // entry #1 - 8, 9, 10, // entry #2 - 12, 13, 14, // entry #3 - ], - Some(&[3, 7]), // trns - ); - - // Too-short `trns` is treated differently from too-long - only missing entries are - // replaced with 0XFF / opaque. - assert_eq!( - actual, - vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0xFF, 12, 13, 14, 0xFF], - ); - } -} diff --git a/src/decoder/transform/palette.rs b/src/decoder/transform/palette.rs new file mode 100644 index 00000000..57d5c6a4 --- /dev/null +++ b/src/decoder/transform/palette.rs @@ -0,0 +1,195 @@ +//! Helpers for taking a slice of indeces (indices into `PLTE` and/or `trNS` +//! entries) and transforming this into RGB or RGBA output. + +use super::unpack_bits; +use crate::Info; + +pub fn expand_paletted_into_rgb8(row: &[u8], buffer: &mut [u8], info: &Info) { + let palette = info.palette.as_deref().expect("Caller should verify"); + let black = [0, 0, 0]; + + unpack_bits(row, buffer, 3, info.bit_depth as u8, |i, chunk| { + let rgb = palette + .get(3 * i as usize..3 * i as usize + 3) + .unwrap_or(&black); + chunk[0] = rgb[0]; + chunk[1] = rgb[1]; + chunk[2] = rgb[2]; + }) +} + +pub fn expand_paletted_into_rgba8(row: &[u8], buffer: &mut [u8], info: &Info) { + let palette = info.palette.as_deref().expect("Caller should verify"); + let trns = info.trns.as_deref().unwrap_or(&[]); + let black = [0, 0, 0]; + + // > The tRNS chunk shall not contain more alpha values than there are palette + // entries, but a tRNS chunk may contain fewer values than there are palette + // entries. In this case, the alpha value for all remaining palette entries is + // assumed to be 255. + // + // It seems, accepted reading is to fully *ignore* an invalid tRNS as if it were + // completely empty / all pixels are non-transparent. + let trns = if trns.len() <= palette.len() / 3 { + trns + } else { + &[] + }; + + unpack_bits(row, buffer, 4, info.bit_depth as u8, |i, chunk| { + let (rgb, a) = ( + palette + .get(3 * i as usize..3 * i as usize + 3) + .unwrap_or(&black), + *trns.get(i as usize).unwrap_or(&0xFF), + ); + chunk[0] = rgb[0]; + chunk[1] = rgb[1]; + chunk[2] = rgb[2]; + chunk[3] = a; + }); +} + +#[cfg(test)] +mod test { + use crate::{BitDepth, ColorType, Info, Transformations}; + + fn expand_paletted( + src: &[u8], + src_bit_depth: u8, + palette: &[u8], + trns: Option<&[u8]>, + ) -> Vec { + let info = Info { + color_type: ColorType::Indexed, + bit_depth: BitDepth::from_u8(src_bit_depth).unwrap(), + palette: Some(palette.into()), + trns: trns.map(Into::into), + ..Info::default() + }; + let output_bytes_per_input_sample = match trns { + None => 3, + Some(_) => 4, + }; + let samples_count_per_byte = (8 / src_bit_depth) as usize; + let samples_count = src.len() * samples_count_per_byte; + let mut dst = vec![0; samples_count * output_bytes_per_input_sample]; + let transform_fn = + super::super::create_transform_fn(&info, Transformations::EXPAND).unwrap(); + transform_fn(src, dst.as_mut_slice(), &info); + dst + } + + #[test] + fn test_expand_paletted_rgba_8bit() { + let actual = expand_paletted( + &[0, 1, 2, 3], // src + 8, // src_bit_depth + &[ + // palette + 0, 1, 2, // entry #0 + 4, 5, 6, // entry #1 + 8, 9, 10, // entry #2 + 12, 13, 14, // entry #3 + ], + Some(&[3, 7, 11, 15]), // trns + ); + assert_eq!(actual, (0..16).collect::>()); + } + + #[test] + fn test_expand_paletted_rgb_8bit() { + let actual = expand_paletted( + &[0, 1, 2, 3], // src + 8, // src_bit_depth + &[ + // palette + 0, 1, 2, // entry #0 + 3, 4, 5, // entry #1 + 6, 7, 8, // entry #2 + 9, 10, 11, // entry #3 + ], + None, // trns + ); + assert_eq!(actual, (0..12).collect::>()); + } + + #[test] + fn test_expand_paletted_rgba_4bit() { + let actual = expand_paletted( + &[0x01, 0x23], // src + 4, // src_bit_depth + &[ + // palette + 0, 1, 2, // entry #0 + 4, 5, 6, // entry #1 + 8, 9, 10, // entry #2 + 12, 13, 14, // entry #3 + ], + Some(&[3, 7, 11, 15]), // trns + ); + assert_eq!(actual, (0..16).collect::>()); + } + + #[test] + fn test_expand_paletted_rgb_4bit() { + let actual = expand_paletted( + &[0x01, 0x23], // src + 4, // src_bit_depth + &[ + // palette + 0, 1, 2, // entry #0 + 3, 4, 5, // entry #1 + 6, 7, 8, // entry #2 + 9, 10, 11, // entry #3 + ], + None, // trns + ); + assert_eq!(actual, (0..12).collect::>()); + } + + #[test] + fn test_expand_paletted_rgba_8bit_more_trns_entries_than_palette_entries() { + let actual = expand_paletted( + &[0, 1, 2, 3], // src + 8, // src_bit_depth + &[ + // palette + 0, 1, 2, // entry #0 + 4, 5, 6, // entry #1 + 8, 9, 10, // entry #2 + 12, 13, 14, // entry #3 + ], + Some(&[123; 5]), // trns + ); + + // Invalid (too-long) `trns` means that we'll use 0xFF / opaque alpha everywhere. + assert_eq!( + actual, + vec![0, 1, 2, 0xFF, 4, 5, 6, 0xFF, 8, 9, 10, 0xFF, 12, 13, 14, 0xFF], + ); + } + + #[test] + fn test_expand_paletted_rgba_8bit_less_trns_entries_than_palette_entries() { + let actual = expand_paletted( + &[0, 1, 2, 3], // src + 8, // src_bit_depth + &[ + // palette + 0, 1, 2, // entry #0 + 4, 5, 6, // entry #1 + 8, 9, 10, // entry #2 + 12, 13, 14, // entry #3 + ], + Some(&[3, 7]), // trns + ); + + // Too-short `trns` is treated differently from too-long - only missing entries are + // replaced with 0XFF / opaque. + assert_eq!( + actual, + vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0xFF, 12, 13, 14, 0xFF], + ); + } +} From 89153e67328ac7deaaa3ce520f004bccfec2b91c Mon Sep 17 00:00:00 2001 From: Lukasz Anforowicz Date: Mon, 29 Jan 2024 22:56:55 +0000 Subject: [PATCH 2/6] Fix constants used in palette benchmarks. The `PLTE` chunk's size should be a multiple of 3 (since it contains RGB entries - 3 bytes per entry). Additionally, taking 10000 samples in the `bench_create_fn` benchmarks is a bit excessive after memoization. --- benches/expand_paletted.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benches/expand_paletted.rs b/benches/expand_paletted.rs index 06294368..caf17239 100644 --- a/benches/expand_paletted.rs +++ b/benches/expand_paletted.rs @@ -114,10 +114,10 @@ fn create_expand_palette_fn(info: &Info) -> TransformFn { fn bench_create_fn(c: &mut Criterion, plte_size: usize, trns_size: usize) { let mut group = c.benchmark_group("expand_paletted(ctor)"); - group.sample_size(10000); + group.sample_size(1000); let mut rng = rand::thread_rng(); - let plte = get_random_bytes(&mut rng, plte_size as usize); + let plte = get_random_bytes(&mut rng, 3 * plte_size as usize); let trns = get_random_bytes(&mut rng, trns_size as usize); let info = create_info_from_plte_trns_bitdepth(&plte, Some(&trns), 8); group.bench_with_input( From 6b17e138d1f3c05b9653318ddbbea4a0ca980595 Mon Sep 17 00:00:00 2001 From: Lukasz Anforowicz Date: Mon, 29 Jan 2024 19:11:19 +0000 Subject: [PATCH 3/6] Change `TransformFn` to allow memoization in the future This commit changes the `TransformFn` type alias from `fn(...)` into `Box`. This allows the `TransformFn` to have store some precomputer, memoized state that we plan to add in follow-up commits. In theory this commit may have negative performance impact, but in the grand scheme of things it disappears into the measurement noise. In particular, when there is no state, then `Box` shouldn't allocate. --- src/decoder/mod.rs | 2 +- src/decoder/transform.rs | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/decoder/mod.rs b/src/decoder/mod.rs index 21d51a66..676e04f3 100644 --- a/src/decoder/mod.rs +++ b/src/decoder/mod.rs @@ -638,7 +638,7 @@ impl Reader { if self.transform_fn.is_none() { self.transform_fn = Some(create_transform_fn(self.info(), self.transform)?); } - self.transform_fn.unwrap() + self.transform_fn.as_deref().unwrap() }; transform_fn(row, output_buffer, self.info()); diff --git a/src/decoder/transform.rs b/src/decoder/transform.rs index cc89ac99..43da0dc3 100644 --- a/src/decoder/transform.rs +++ b/src/decoder/transform.rs @@ -12,7 +12,7 @@ use super::stream::FormatErrorInner; /// /// TODO: If some precomputed state is needed (e.g. to make `expand_paletted...` /// faster) then consider changing this into `Box`. -pub type TransformFn = fn(&[u8], &mut [u8], &Info); +pub type TransformFn = Box; /// Returns a transformation function that should be applied to image rows based /// on 1) decoded image metadata (`info`) and 2) the transformations requested @@ -43,36 +43,36 @@ pub fn create_transform_fn( .into(), )); } else { - if trns { - Ok(palette::expand_paletted_into_rgba8) + Ok(Box::new(if trns { + palette::expand_paletted_into_rgba8 } else { - Ok(palette::expand_paletted_into_rgb8) - } + palette::expand_paletted_into_rgb8 + })) } } ColorType::Grayscale | ColorType::GrayscaleAlpha if bit_depth < 8 && expand => { - if trns { - Ok(expand_gray_u8_with_trns) + Ok(Box::new(if trns { + expand_gray_u8_with_trns } else { - Ok(expand_gray_u8) - } + expand_gray_u8 + })) } ColorType::Grayscale | ColorType::Rgb if expand && trns => { - if bit_depth == 8 { - Ok(expand_trns_line) + Ok(Box::new(if bit_depth == 8 { + expand_trns_line } else if strip16 { - Ok(expand_trns_and_strip_line16) + expand_trns_and_strip_line16 } else { assert_eq!(bit_depth, 16); - Ok(expand_trns_line16) - } + expand_trns_line16 + })) } ColorType::Grayscale | ColorType::GrayscaleAlpha | ColorType::Rgb | ColorType::Rgba if strip16 => { - Ok(transform_row_strip16) + Ok(Box::new(transform_row_strip16)) } - _ => Ok(copy_row), + _ => Ok(Box::new(copy_row)), } } From d24e9b3ea87c158193c2e466beda01ed683a6a98 Mon Sep 17 00:00:00 2001 From: Lukasz Anforowicz Date: Mon, 29 Jan 2024 19:33:38 +0000 Subject: [PATCH 4/6] Memoize combined PLTE+trNS lookup table. Before this commit `expand_paletted_into_rgba8` would: * Perform 2 lookups - `palette.get(i)` and `trns.get(i)` * Check via `unwrap_or` if `i` was within the bounds of `palette`/`trns` This commit introduces `create_rgba_palette` which combines `palette` and `trns` into a fixed-size `[[u8;4]; 256]` look-up table (called `rgba_palette` in the code). After this commit `expand_paletted_into_rgba8` only needs to perform a single look-up and doesn't need to check the bounds. This helps to improve the expansion time by 60+%: - expand_paletted(exec)/trns=yes/src_bits=4/src_size=5461: [-60.208% -60.057% -59.899%] (p = 0.00 < 0.05) - expand_paletted(exec)/trns=yes/src_bits=8/src_size=5461: [-77.520% -77.407% -77.301%] (p = 0.00 < 0.05) `expand_paletted_into_rgb8` performs only a single lookup before and after this commit, but avoiding bounds checks still helps to improve the expansion time by ~12%: - expand_paletted(exec)/trns=no/src_bits=4/src_size=5461: [-12.357% -12.005% -11.664%] (p = 0.00 < 0.05) - expand_paletted(exec)/trns=no/src_bits=8/src_size=5461: [-13.135% -12.584% -12.092%] (p = 0.00 < 0.05) Understandably, this commit regresses the time of `create_transform_fn`. Future commits will reduce this regression 2-4 times: - expand_paletted(ctor)/plte=256/trns=256: [+3757.2% +3763.8% +3770.5%] (p = 0.00 < 0.05) - expand_paletted(ctor)/plte=224/trns=32: [+3807.3% +3816.2% +3824.6%] (p = 0.00 < 0.05) - expand_paletted(ctor)/plte=16/trns=1: [+1672.0% +1675.0% +1678.1%] (p = 0.00 < 0.05) --- src/decoder/transform.rs | 8 +- src/decoder/transform/palette.rs | 192 ++++++++++++++++++++++++++----- 2 files changed, 166 insertions(+), 34 deletions(-) diff --git a/src/decoder/transform.rs b/src/decoder/transform.rs index 43da0dc3..7271bcb9 100644 --- a/src/decoder/transform.rs +++ b/src/decoder/transform.rs @@ -43,11 +43,11 @@ pub fn create_transform_fn( .into(), )); } else { - Ok(Box::new(if trns { - palette::expand_paletted_into_rgba8 + Ok(if trns { + palette::create_expansion_into_rgba8(info) } else { - palette::expand_paletted_into_rgb8 - })) + palette::create_expansion_into_rgb8(info) + }) } } ColorType::Grayscale | ColorType::GrayscaleAlpha if bit_depth < 8 && expand => { diff --git a/src/decoder/transform/palette.rs b/src/decoder/transform/palette.rs index 57d5c6a4..f9bf5f86 100644 --- a/src/decoder/transform/palette.rs +++ b/src/decoder/transform/palette.rs @@ -1,27 +1,41 @@ //! Helpers for taking a slice of indeces (indices into `PLTE` and/or `trNS` //! entries) and transforming this into RGB or RGBA output. +//! +//! # Memoization +//! +//! To achieve higher throughput, `create_rgba_palette` combines entries from +//! `PLTE` and `trNS` chunks into a single lookup table. This is based on the +//! ideas explored in https://crbug.com/706134. +//! +//! Memoization is a trade-off: +//! * On one hand, memoization requires spending X ns before starting to call +//! `expand_paletted_...` functions. +//! * On the other hand, memoization improves the throughput of the +//! `expand_paletted_...` functions - they take Y ns less to process each byte +//! +//! Based on X and Y, we can try to calculate the breakeven point. It seems +//! that memoization is a net benefit for images bigger than around 13x13 pixels. -use super::unpack_bits; +use super::{unpack_bits, TransformFn}; use crate::Info; -pub fn expand_paletted_into_rgb8(row: &[u8], buffer: &mut [u8], info: &Info) { - let palette = info.palette.as_deref().expect("Caller should verify"); - let black = [0, 0, 0]; +pub fn create_expansion_into_rgb8(info: &Info) -> TransformFn { + let rgba_palette = create_rgba_palette(info); + Box::new(move |input, output, info| { + expand_paletted_into_rgb8(input, output, info, &rgba_palette) + }) +} - unpack_bits(row, buffer, 3, info.bit_depth as u8, |i, chunk| { - let rgb = palette - .get(3 * i as usize..3 * i as usize + 3) - .unwrap_or(&black); - chunk[0] = rgb[0]; - chunk[1] = rgb[1]; - chunk[2] = rgb[2]; +pub fn create_expansion_into_rgba8(info: &Info) -> TransformFn { + let rgba_palette = create_rgba_palette(info); + Box::new(move |input, output, info| { + expand_paletted_into_rgba8(input, output, info, &rgba_palette) }) } -pub fn expand_paletted_into_rgba8(row: &[u8], buffer: &mut [u8], info: &Info) { +fn create_rgba_palette(info: &Info) -> [[u8; 4]; 256] { let palette = info.palette.as_deref().expect("Caller should verify"); let trns = info.trns.as_deref().unwrap_or(&[]); - let black = [0, 0, 0]; // > The tRNS chunk shall not contain more alpha values than there are palette // entries, but a tRNS chunk may contain fewer values than there are palette @@ -36,17 +50,44 @@ pub fn expand_paletted_into_rgba8(row: &[u8], buffer: &mut [u8], info: &Info) { &[] }; + // Default to black, opaque entries. + let mut rgba_palette = [[0, 0, 0, 0xFF]; 256]; + + // Replace missing `trns` entry with 100%-opaque alpha. + let trns = trns.iter().copied().chain(std::iter::repeat(0xFF)); + + // Combine `palette` and `trns` into a single lookup table: `rgba_palette`. + let rgba_and_alpha_iter = palette.chunks_exact(3).zip(trns); + for (rgba, (rgb, alpha)) in rgba_palette.iter_mut().zip(rgba_and_alpha_iter) { + rgba[0..3].copy_from_slice(rgb); + rgba[3] = alpha; + } + + rgba_palette +} + +fn expand_paletted_into_rgb8( + row: &[u8], + buffer: &mut [u8], + info: &Info, + rgba_palette: &[[u8; 4]; 256], +) { + unpack_bits(row, buffer, 3, info.bit_depth as u8, |i, chunk| { + let rgba = &rgba_palette[i as usize]; + chunk[0] = rgba[0]; + chunk[1] = rgba[1]; + chunk[2] = rgba[2]; + }) +} + +fn expand_paletted_into_rgba8( + row: &[u8], + buffer: &mut [u8], + info: &Info, + rgba_palette: &[[u8; 4]; 256], +) { unpack_bits(row, buffer, 4, info.bit_depth as u8, |i, chunk| { - let (rgb, a) = ( - palette - .get(3 * i as usize..3 * i as usize + 3) - .unwrap_or(&black), - *trns.get(i as usize).unwrap_or(&0xFF), - ); - chunk[0] = rgb[0]; - chunk[1] = rgb[1]; - chunk[2] = rgb[2]; - chunk[3] = a; + chunk.copy_from_slice(&rgba_palette[i as usize]); }); } @@ -54,29 +95,94 @@ pub fn expand_paletted_into_rgba8(row: &[u8], buffer: &mut [u8], info: &Info) { mod test { use crate::{BitDepth, ColorType, Info, Transformations}; + /// Old, non-memoized version of the code is used as a test oracle. + fn oracle_expand_paletted_into_rgb8(row: &[u8], buffer: &mut [u8], info: &Info) { + let palette = info.palette.as_deref().expect("Caller should verify"); + let black = [0, 0, 0]; + + super::unpack_bits(row, buffer, 3, info.bit_depth as u8, |i, chunk| { + let rgb = palette + .get(3 * i as usize..3 * i as usize + 3) + .unwrap_or(&black); + chunk[0] = rgb[0]; + chunk[1] = rgb[1]; + chunk[2] = rgb[2]; + }) + } + + /// Old, non-memoized version of the code is used as a test oracle. + fn oracle_expand_paletted_into_rgba8(row: &[u8], buffer: &mut [u8], info: &Info) { + let palette = info.palette.as_deref().expect("Caller should verify"); + let trns = info.trns.as_deref().unwrap_or(&[]); + let black = [0, 0, 0]; + + // > The tRNS chunk shall not contain more alpha values than there are palette + // entries, but a tRNS chunk may contain fewer values than there are palette + // entries. In this case, the alpha value for all remaining palette entries is + // assumed to be 255. + // + // It seems, accepted reading is to fully *ignore* an invalid tRNS as if it were + // completely empty / all pixels are non-transparent. + let trns = if trns.len() <= palette.len() / 3 { + trns + } else { + &[] + }; + + super::unpack_bits(row, buffer, 4, info.bit_depth as u8, |i, chunk| { + let (rgb, a) = ( + palette + .get(3 * i as usize..3 * i as usize + 3) + .unwrap_or(&black), + *trns.get(i as usize).unwrap_or(&0xFF), + ); + chunk[0] = rgb[0]; + chunk[1] = rgb[1]; + chunk[2] = rgb[2]; + chunk[3] = a; + }); + } + + fn create_info<'a>(src_bit_depth: u8, palette: &'a [u8], trns: Option<&'a [u8]>) -> Info<'a> { + Info { + color_type: ColorType::Indexed, + bit_depth: BitDepth::from_u8(src_bit_depth).unwrap(), + palette: Some(palette.into()), + trns: trns.map(Into::into), + ..Info::default() + } + } + fn expand_paletted( src: &[u8], src_bit_depth: u8, palette: &[u8], trns: Option<&[u8]>, ) -> Vec { - let info = Info { - color_type: ColorType::Indexed, - bit_depth: BitDepth::from_u8(src_bit_depth).unwrap(), - palette: Some(palette.into()), - trns: trns.map(Into::into), - ..Info::default() - }; + let info = create_info(src_bit_depth, palette, trns); let output_bytes_per_input_sample = match trns { None => 3, Some(_) => 4, }; let samples_count_per_byte = (8 / src_bit_depth) as usize; let samples_count = src.len() * samples_count_per_byte; + let mut dst = vec![0; samples_count * output_bytes_per_input_sample]; let transform_fn = super::super::create_transform_fn(&info, Transformations::EXPAND).unwrap(); transform_fn(src, dst.as_mut_slice(), &info); + + { + // Compare the memoization-based calculations with the old, non-memoized code. + let mut simple_dst = vec![0; samples_count * output_bytes_per_input_sample]; + if trns.is_none() { + oracle_expand_paletted_into_rgb8(src, &mut simple_dst, &info) + } else { + oracle_expand_paletted_into_rgba8(src, &mut simple_dst, &info) + } + assert_eq!(&dst, &simple_dst); + } + dst } @@ -192,4 +298,30 @@ mod test { vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0xFF, 12, 13, 14, 0xFF], ); } + + #[test] + fn test_create_rgba_palette() { + fn create_expected_rgba_palette(plte: &[u8], trns: &[u8]) -> [[u8; 4]; 256] { + let mut rgba = [[1, 2, 3, 4]; 256]; + for (i, rgba) in rgba.iter_mut().enumerate() { + rgba[0] = plte.get(i * 3 + 0).map(|&r| r).unwrap_or(0); + rgba[1] = plte.get(i * 3 + 1).map(|&g| g).unwrap_or(0); + rgba[2] = plte.get(i * 3 + 2).map(|&b| b).unwrap_or(0); + rgba[3] = trns.get(i * 1 + 0).map(|&a| a).unwrap_or(0xFF); + } + rgba + } + + for plte_len in 1..=32 { + for trns_len in 0..=plte_len { + let plte: Vec = (0..plte_len * 3).collect(); + let trns: Vec = (0..trns_len).map(|alpha| alpha + 200).collect(); + + let info = create_info(8, &plte, Some(&trns)); + let expected = create_expected_rgba_palette(&plte, &trns); + let actual = super::create_rgba_palette(&info); + assert_eq!(actual, expected); + } + } + } } From fbd33dfb702548929ddc2172700581769bfa07e4 Mon Sep 17 00:00:00 2001 From: Lukasz Anforowicz Date: Mon, 29 Jan 2024 22:00:26 +0000 Subject: [PATCH 5/6] Copy 4 bytes at a time when expanding palette into rgb8. Before this commit `expand_into_rgb8` would copy 3 bytes at a time into the output. After this commit it copies 4 bytes at a time (possibly cloberring pixels that will be populated during the next iteration - this is ok). This improved the performance as follows: expand_paletted(exec)/trns=no/src_bits=8/src_size=5461 time: [-23.852% -23.593% -23.319%] (p = 0.00 < 0.05) --- src/decoder/transform/palette.rs | 33 ++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/decoder/transform/palette.rs b/src/decoder/transform/palette.rs index f9bf5f86..2fdc233c 100644 --- a/src/decoder/transform/palette.rs +++ b/src/decoder/transform/palette.rs @@ -17,13 +17,16 @@ //! that memoization is a net benefit for images bigger than around 13x13 pixels. use super::{unpack_bits, TransformFn}; -use crate::Info; +use crate::{BitDepth, Info}; pub fn create_expansion_into_rgb8(info: &Info) -> TransformFn { let rgba_palette = create_rgba_palette(info); - Box::new(move |input, output, info| { - expand_paletted_into_rgb8(input, output, info, &rgba_palette) - }) + + if info.bit_depth == BitDepth::Eight { + Box::new(move |input, output, _info| expand_8bit_into_rgb8(input, output, &rgba_palette)) + } else { + Box::new(move |input, output, info| expand_into_rgb8(input, output, info, &rgba_palette)) + } } pub fn create_expansion_into_rgba8(info: &Info) -> TransformFn { @@ -66,12 +69,22 @@ fn create_rgba_palette(info: &Info) -> [[u8; 4]; 256] { rgba_palette } -fn expand_paletted_into_rgb8( - row: &[u8], - buffer: &mut [u8], - info: &Info, - rgba_palette: &[[u8; 4]; 256], -) { +fn expand_8bit_into_rgb8(mut input: &[u8], mut output: &mut [u8], rgba_palette: &[[u8; 4]; 256]) { + while output.len() >= 4 { + // Copying 4 bytes at a time is more efficient than 3. + let rgba = &rgba_palette[input[0] as usize]; + output[0..4].copy_from_slice(rgba); + + input = &input[1..]; + output = &mut output[3..]; + } + if output.len() > 0 { + let rgba = &rgba_palette[input[0] as usize]; + output[0..3].copy_from_slice(&rgba[0..3]); + } +} + +fn expand_into_rgb8(row: &[u8], buffer: &mut [u8], info: &Info, rgba_palette: &[[u8; 4]; 256]) { unpack_bits(row, buffer, 3, info.bit_depth as u8, |i, chunk| { let rgba = &rgba_palette[i as usize]; chunk[0] = rgba[0]; From 57b5d540df2fba25b72da625aae4821b66b77c60 Mon Sep 17 00:00:00 2001 From: Lukasz Anforowicz Date: Mon, 29 Jan 2024 22:09:55 +0000 Subject: [PATCH 6/6] Copy 4 bytes at a time in `create_rgba_palette` This improves the performance as follows: - expand_paletted(ctor)/plte=256/trns=256 [-40.581% -40.396% -40.211%] (p = 0.00 < 0.05) - expand_paletted(ctor)/plte=224/trns=32 [-24.070% -23.840% -23.592%] (p = 0.00 < 0.05) Small palettes are mostly unaffected: - expand_paletted(ctor)/plte=16/trns=1 [-0.2525% +0.0338% +0.3239%] (p = 0.81 > 0.05) --- src/decoder/transform/palette.rs | 33 ++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/decoder/transform/palette.rs b/src/decoder/transform/palette.rs index 2fdc233c..72dc88f1 100644 --- a/src/decoder/transform/palette.rs +++ b/src/decoder/transform/palette.rs @@ -56,16 +56,37 @@ fn create_rgba_palette(info: &Info) -> [[u8; 4]; 256] { // Default to black, opaque entries. let mut rgba_palette = [[0, 0, 0, 0xFF]; 256]; - // Replace missing `trns` entry with 100%-opaque alpha. - let trns = trns.iter().copied().chain(std::iter::repeat(0xFF)); + // Copy `palette` (RGB) entries into `rgba_palette`. This may clobber alpha + // values in `rgba_palette` - we need to fix this later. + { + let mut palette_iter = palette; + let mut rgba_iter = &mut rgba_palette[..]; + while palette_iter.len() >= 4 { + // Copying 4 bytes at a time is more efficient than copying 3. + // OTOH, this clobbers the alpha value in `rgba_iter[0][3]` - we + // need to fix this later. + rgba_iter[0].copy_from_slice(&palette_iter[0..4]); + + palette_iter = &palette_iter[3..]; + rgba_iter = &mut rgba_iter[1..]; + } + if palette_iter.len() > 0 { + rgba_iter[0][0..3].copy_from_slice(&palette_iter[0..3]); + } + } - // Combine `palette` and `trns` into a single lookup table: `rgba_palette`. - let rgba_and_alpha_iter = palette.chunks_exact(3).zip(trns); - for (rgba, (rgb, alpha)) in rgba_palette.iter_mut().zip(rgba_and_alpha_iter) { - rgba[0..3].copy_from_slice(rgb); + // Copy `trns` (alpha) entries into `rgba_palette`. `trns.len()` may be + // smaller than `palette.len()` and therefore this is not sufficient to fix + // all the clobbered alpha values. + for (alpha, rgba) in trns.iter().copied().zip(rgba_palette.iter_mut()) { rgba[3] = alpha; } + // Unclobber the remaining alpha values. + for rgba in rgba_palette[trns.len()..(palette.len() / 3)].iter_mut() { + rgba[3] = 0xFF; + } + rgba_palette }