diff --git a/autotest/cpp/test_gdal.cpp b/autotest/cpp/test_gdal.cpp
index 7cf821258994..7ae64a189bad 100644
--- a/autotest/cpp/test_gdal.cpp
+++ b/autotest/cpp/test_gdal.cpp
@@ -5039,7 +5039,21 @@ TEST_F(test_gdal, GDALTranspose2D_Byte_optims)
         }
     }
 
-    // SSSE3 optim H = 3
+    // SSSE3 optim H = 3 with W < 16
+    {
+        constexpr int W = 15;
+        constexpr int H = 3;
+        GDALTranspose2D(in.data(), GDT_Byte, out.data(), GDT_Byte, W, H);
+        for (int y = 0; y < H; ++y)
+        {
+            for (int x = 0; x < W; ++x)
+            {
+                EXPECT_EQ(out[x * H + y], in[y * W + x]);
+            }
+        }
+    }
+
+    // SSSE3 optim H = 3 with W >= 16
     {
         constexpr int W = 19;
         constexpr int H = 3;
diff --git a/gcore/rasterio_ssse3.cpp b/gcore/rasterio_ssse3.cpp
index b09fb4726fc8..9bf80e34096a 100644
--- a/gcore/rasterio_ssse3.cpp
+++ b/gcore/rasterio_ssse3.cpp
@@ -272,10 +272,148 @@ void GDALDeinterleave4UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
 }
 #endif
 
+/************************************************************************/
+/*                               loadu()                                */
+/************************************************************************/
+
+inline __m128i loadu(const uint8_t *pSrc, size_t i, size_t srcStride)
+{
+    return _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(pSrc + i * srcStride));
+}
+
+/************************************************************************/
+/*                               storeu()                               */
+/************************************************************************/
+
+inline void storeu(uint8_t *pDst, size_t i, size_t dstStride, __m128i reg)
+{
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + i * dstStride), reg);
+}
+
 /************************************************************************/
 /*                      GDALInterleave3Byte_SSSE3()                     */
 /************************************************************************/
 
+#if (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
+
+inline __m128i GDAL_mm_or_3_si128(__m128i r0, __m128i r1, __m128i r2)
+{
+    return _mm_or_si128(_mm_or_si128(r0, r1), r2);
+}
+
+// ICC autovectorizer doesn't do a good job at generating good SSE code,
+// at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
+#if defined(__GNUC__)
+__attribute__((noinline))
+#endif
+static void
+GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
+                          uint8_t *CPL_RESTRICT pDst, size_t nIters)
+{
+    size_t i = 0;
+    constexpr size_t VALS_PER_ITER = 16;
+
+    if (nIters >= VALS_PER_ITER)
+    {
+        // clang-format off
+        constexpr char X = -1;
+        // How to dispatch 16 values of row=0 onto 3x16 bytes
+        const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X,
+                                                    1, X, X,
+                                                    2, X, X,
+                                                    3, X, X,
+                                                    4, X, X,
+                                                    5);
+        const __m128i xmm_shuffle01 = _mm_setr_epi8(   X, X,
+                                                    6, X, X,
+                                                    7, X, X,
+                                                    8, X, X,
+                                                    9, X, X,
+                                                    10,X);
+        const __m128i xmm_shuffle02 = _mm_setr_epi8(       X,
+                                                    11, X, X,
+                                                    12, X, X,
+                                                    13, X, X,
+                                                    14, X, X,
+                                                    15, X, X);
+
+        // How to dispatch 16 values of row=1 onto 3x16 bytes
+        const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X,
+                                                    X, 1, X,
+                                                    X, 2, X,
+                                                    X, 3, X,
+                                                    X, 4, X,
+                                                    X);
+        const __m128i xmm_shuffle11 = _mm_setr_epi8(   5, X,
+                                                    X, 6, X,
+                                                    X, 7, X,
+                                                    X, 8, X,
+                                                    X, 9, X,
+                                                    X,10);
+        const __m128i xmm_shuffle12 = _mm_setr_epi8(       X,
+                                                    X, 11, X,
+                                                    X, 12, X,
+                                                    X, 13, X,
+                                                    X, 14, X,
+                                                    X, 15, X);
+
+        // How to dispatch 16 values of row=2 onto 3x16 bytes
+        const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0,
+                                                    X, X, 1,
+                                                    X, X, 2,
+                                                    X, X, 3,
+                                                    X, X, 4,
+                                                    X);
+        const __m128i xmm_shuffle21 = _mm_setr_epi8(   X, 5,
+                                                    X, X, 6,
+                                                    X, X, 7,
+                                                    X, X, 8,
+                                                    X, X, 9,
+                                                    X, X);
+        const __m128i xmm_shuffle22 = _mm_setr_epi8(      10,
+                                                    X, X, 11,
+                                                    X, X, 12,
+                                                    X, X, 13,
+                                                    X, X, 14,
+                                                    X, X, 15);
+        // clang-format on
+
+        for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
+        {
+#define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
+            LOAD(0);
+            LOAD(1);
+            LOAD(2);
+
+#define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
+#define COMBINE_3(x)                                                           \
+    GDAL_mm_or_3_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2))
+
+#define STORE(x)                                                               \
+    storeu(pDst, 3 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_3(x))
+            STORE(0);
+            STORE(1);
+            STORE(2);
+#undef LOAD
+#undef COMBINE_3
+#undef SHUFFLE
+#undef STORE
+        }
+    }
+
+    for (; i < nIters; ++i)
+    {
+#define INTERLEAVE(x) pDst[3 * i + x] = pSrc[i + x * nIters]
+        INTERLEAVE(0);
+        INTERLEAVE(1);
+        INTERLEAVE(2);
+#undef INTERLEAVE
+    }
+}
+
+#else
+
 #if defined(__GNUC__) && !defined(__clang__)
 __attribute__((optimize("tree-vectorize")))
 #endif
@@ -297,21 +435,12 @@ GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
     }
 }
 
+#endif
+
 /************************************************************************/
 /*                      GDALInterleave5Byte_SSSE3()                     */
 /************************************************************************/
 
-inline __m128i loadu(const uint8_t *pSrc, size_t i, size_t srcStride)
-{
-    return _mm_loadu_si128(
-        reinterpret_cast<const __m128i *>(pSrc + i * srcStride));
-}
-
-inline void storeu(uint8_t *pDst, size_t i, size_t dstStride, __m128i reg)
-{
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + i * dstStride), reg);
-}
-
 inline __m128i GDAL_mm_or_5_si128(__m128i r0, __m128i r1, __m128i r2,
                                   __m128i r3, __m128i r4)
 {