From 3c3f7151bdafd97efc845c93f2ee7a6c543f9fd3 Mon Sep 17 00:00:00 2001 From: "Vlad (Kuzmin) Erium" Date: Thu, 7 Nov 2024 04:24:35 +0900 Subject: [PATCH] perf: IBA::unsharp_mask() speed and memory optimization (#4513) Replace the 3 IBA calls plus a helper function, which together generate 4 full-size image buffers, with a single unsharp_impl() that uses parallel_image() to compute the unsharp mask: src + contrast * ((abs(src - blur) <= threshold) ? 0.0 : (src - blur)) Also add a two-pass 1D convolution for kernels larger than 3x3. ## Tests ``` ImageBuf sharped(input.spec()); const int repeats = 50; std::cout << "Start sharpening\n"; auto start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < repeats; i++) { //ok = ImageBufAlgo::unsharp_mask(sharped, input, "gaussian", 15.0f, 10.0f, 0.01f); ok = ImageBufAlgo::unsharp_mask(sharped, input, "gaussian", 5.0f, 2.0f, 0.05f); std::cout << "."; } std::cout << "\n"; auto part1 = std::chrono::high_resolution_clock::now(); std::chrono::duration<double> elapsed_part1 = part1 - start; std::cout << "Elapsed time: " << elapsed_part1.count() << " s\n"; ``` Both single-threaded (one IB at a time) and multithreaded (multiple IBs at a time) runs show a pretty good speedup: ~30-40%, with less memory use. For 5x5 Gaussian kernels, the two-pass mode should add at least a 20% speedup. (If someone can run an independent benchmark, that would be great. 
My own timings differed a lot depending on whether the workload was real or synthetic.) --------- Signed-off-by: Vlad (Kuzmin) Erium --- src/libOpenImageIO/imagebufalgo.cpp | 64 +++++++++++-------- testsuite/docs-examples-cpp/ref/out-arm.txt | 2 +- testsuite/docs-examples-cpp/ref/out.txt | 2 +- .../docs-examples-python/ref/out-arm.txt | 2 +- testsuite/docs-examples-python/ref/out.txt | 2 +- 5 files changed, 42 insertions(+), 30 deletions(-) diff --git a/src/libOpenImageIO/imagebufalgo.cpp b/src/libOpenImageIO/imagebufalgo.cpp index c3f3c50b92..483327093d 100644 --- a/src/libOpenImageIO/imagebufalgo.cpp +++ b/src/libOpenImageIO/imagebufalgo.cpp @@ -947,17 +947,28 @@ ImageBufAlgo::make_kernel(string_view name, float width, float height, -// Helper function for unsharp mask to perform the thresholding +template static bool -threshold_to_zero(ImageBuf& dst, float threshold, ROI roi, int nthreads) +unsharp_impl(ImageBuf& dst, const ImageBuf& blr, const ImageBuf& src, + const float contrast, const float threshold, ROI roi, int nthreads) { - OIIO_DASSERT(dst.spec().format.basetype == TypeDesc::FLOAT); + OIIO_DASSERT(dst.spec().nchannels == src.spec().nchannels + && dst.spec().nchannels == blr.spec().nchannels); ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { - for (ImageBuf::Iterator p(dst, roi); !p.done(); ++p) - for (int c = roi.chbegin; c < roi.chend; ++c) - if (fabsf(p[c]) < threshold) - p[c] = 0.0f; + ImageBuf::ConstIterator s(src, roi); + ImageBuf::ConstIterator b(blr, roi); + for (ImageBuf::Iterator d(dst, roi); !d.done(); ++s, ++d, ++b) { + for (int c = roi.chbegin; c < roi.chend; ++c) { + const float diff = s[c] - b[c]; + const float abs_diff = fabsf(diff); + if (abs_diff > threshold) { + d[c] = s[c] + contrast * diff; + } else { + d[c] = s[c]; + } + } + } }); return true; } @@ -977,10 +988,26 @@ ImageBufAlgo::unsharp_mask(ImageBuf& dst, const ImageBuf& src, // Blur the source image, store in Blurry ImageSpec BlurrySpec = src.spec(); 
BlurrySpec.set_format(TypeDesc::FLOAT); // force float + ImageBuf fst_pass(BlurrySpec); ImageBuf Blurry(BlurrySpec); if (kernel == "median") { median_filter(Blurry, src, ceilf(width), 0, roi, nthreads); + } else if (width > 3.0) { + ImageBuf K = make_kernel(kernel, 1, width); + ImageBuf Kt = ImageBufAlgo::transpose(K); + if (K.has_error()) { + dst.errorfmt("{}", K.geterror()); + return false; + } + if (!convolve(fst_pass, src, K, true, roi, nthreads)) { + dst.errorfmt("{}", fst_pass.geterror()); + return false; + } + if (!convolve(Blurry, fst_pass, Kt, true, roi, nthreads)) { + dst.errorfmt("{}", Blurry.geterror()); + return false; + } } else { ImageBuf K = make_kernel(kernel, width, width); if (K.has_error()) { @@ -993,25 +1020,10 @@ ImageBufAlgo::unsharp_mask(ImageBuf& dst, const ImageBuf& src, } } - // Compute the difference between the source image and the blurry - // version. (We store it in the same buffer we used for the difference - // image.) - ImageBuf& Diff(Blurry); - bool ok = sub(Diff, src, Blurry, roi, nthreads); - - if (ok && threshold > 0.0f) - ok = threshold_to_zero(Diff, threshold, roi, nthreads); - - // Scale the difference image by the contrast - if (ok) - ok = mul(Diff, Diff, contrast, roi, nthreads); - if (!ok) { - dst.errorfmt("{}", Diff.geterror()); - return false; - } - - // Add the scaled difference to the original, to get the final answer - ok = add(dst, src, Diff, roi, nthreads); + bool ok; + OIIO_DISPATCH_COMMON_TYPES(ok, "unsharp_mask", unsharp_impl, + dst.spec().format, dst, Blurry, src, contrast, + threshold, roi, nthreads); return ok; } diff --git a/testsuite/docs-examples-cpp/ref/out-arm.txt b/testsuite/docs-examples-cpp/ref/out-arm.txt index 4335fb8bef..0fae4c9b02 100644 --- a/testsuite/docs-examples-cpp/ref/out-arm.txt +++ b/testsuite/docs-examples-cpp/ref/out-arm.txt @@ -137,7 +137,7 @@ checker_with_alpha_filled.exr : 256 x 256, 4 channel, half openexr tahoe_median_filter.tif : 512 x 384, 3 channel, uint8 tiff SHA-1: 
A0B2E3A10A16EA8CC905F144C5F91B6A0964A177 tahoe_unsharp_mask.tif : 512 x 384, 3 channel, uint8 tiff - SHA-1: CDE3FAC8053381C59B7BEB3B47991F357E14D9D2 + SHA-1: 5842D16483BC74700DE9FD27967B2FFBD54DFCD2 Comparing "simple.tif" and "ref/simple.tif" PASS Comparing "scanlines.tif" and "ref/scanlines.tif" diff --git a/testsuite/docs-examples-cpp/ref/out.txt b/testsuite/docs-examples-cpp/ref/out.txt index 83d709a796..9805862c13 100644 --- a/testsuite/docs-examples-cpp/ref/out.txt +++ b/testsuite/docs-examples-cpp/ref/out.txt @@ -137,7 +137,7 @@ checker_with_alpha_filled.exr : 256 x 256, 4 channel, half openexr tahoe_median_filter.tif : 512 x 384, 3 channel, uint8 tiff SHA-1: A0B2E3A10A16EA8CC905F144C5F91B6A0964A177 tahoe_unsharp_mask.tif : 512 x 384, 3 channel, uint8 tiff - SHA-1: D3B56074F48EC5D3ADDA4BDE1F487192ABE9BA76 + SHA-1: C1C9C843D45D90B7C0BBD7BCDB7A11814668FC6D Comparing "simple.tif" and "ref/simple.tif" PASS Comparing "scanlines.tif" and "ref/scanlines.tif" diff --git a/testsuite/docs-examples-python/ref/out-arm.txt b/testsuite/docs-examples-python/ref/out-arm.txt index f588e033a9..d2df876a60 100644 --- a/testsuite/docs-examples-python/ref/out-arm.txt +++ b/testsuite/docs-examples-python/ref/out-arm.txt @@ -137,7 +137,7 @@ checker_with_alpha_filled.exr : 256 x 256, 4 channel, half openexr tahoe_median_filter.tif : 512 x 384, 3 channel, uint8 tiff SHA-1: A0B2E3A10A16EA8CC905F144C5F91B6A0964A177 tahoe_unsharp_mask.tif : 512 x 384, 3 channel, uint8 tiff - SHA-1: CDE3FAC8053381C59B7BEB3B47991F357E14D9D2 + SHA-1: 5842D16483BC74700DE9FD27967B2FFBD54DFCD2 Comparing "simple.tif" and "../docs-examples-cpp/ref/simple.tif" PASS Comparing "scanlines.tif" and "../docs-examples-cpp/ref/scanlines.tif" diff --git a/testsuite/docs-examples-python/ref/out.txt b/testsuite/docs-examples-python/ref/out.txt index 835801f3dd..0b8cdebdb9 100644 --- a/testsuite/docs-examples-python/ref/out.txt +++ b/testsuite/docs-examples-python/ref/out.txt @@ -137,7 +137,7 @@ 
checker_with_alpha_filled.exr : 256 x 256, 4 channel, half openexr tahoe_median_filter.tif : 512 x 384, 3 channel, uint8 tiff SHA-1: A0B2E3A10A16EA8CC905F144C5F91B6A0964A177 tahoe_unsharp_mask.tif : 512 x 384, 3 channel, uint8 tiff - SHA-1: D3B56074F48EC5D3ADDA4BDE1F487192ABE9BA76 + SHA-1: C1C9C843D45D90B7C0BBD7BCDB7A11814668FC6D Comparing "simple.tif" and "../docs-examples-cpp/ref/simple.tif" PASS Comparing "scanlines.tif" and "../docs-examples-cpp/ref/scanlines.tif"