diff --git a/library/bignum.c b/library/bignum.c
index 288f85932984..6f634b5d1717 100644
--- a/library/bignum.c
+++ b/library/bignum.c
@@ -38,6 +38,7 @@
 #if defined(MBEDTLS_BIGNUM_C)
 
 #include "mbedtls/bignum.h"
+#include "bignum_internal.h"
 #include "bn_mul.h"
 #include "mbedtls/platform_util.h"
 #include "mbedtls/error.h"
@@ -1369,53 +1370,29 @@ int mbedtls_mpi_sub_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_sint
     return( mbedtls_mpi_sub_mpi( X, A, &B ) );
 }
 
-/** Helper for mbedtls_mpi multiplication.
- *
- * Add \p b * \p s to \p d.
- *
- * \param i             The number of limbs of \p s.
- * \param[in] s         A bignum to multiply, of size \p i.
- *                      It may overlap with \p d, but only if
- *                      \p d <= \p s.
- *                      Its leading limb must not be \c 0.
- * \param[in,out] d     The bignum to add to.
- *                      It must be sufficiently large to store the
- *                      result of the multiplication. This means
- *                      \p i + 1 limbs if \p d[\p i - 1] started as 0 and \p b
- *                      is not known a priori.
- * \param b             A scalar to multiply.
- */
-static
-#if defined(__APPLE__) && defined(__arm__)
-/*
- * Apple LLVM version 4.2 (clang-425.0.24) (based on LLVM 3.2svn)
- * appears to need this to prevent bad ARM code generation at -O3.
- */
-__attribute__ ((noinline))
-#endif
-void mpi_mul_hlp( size_t i,
-                  const mbedtls_mpi_uint *s,
-                  mbedtls_mpi_uint *d,
-                  mbedtls_mpi_uint b )
+mbedtls_mpi_uint mbedtls_mpi_core_mla( mbedtls_mpi_uint *d, size_t d_len,
+                                       const mbedtls_mpi_uint *s, size_t s_len,
+                                       mbedtls_mpi_uint b )
 {
-    mbedtls_mpi_uint c = 0, t = 0;
+    mbedtls_mpi_uint c = 0; /* carry */
+    size_t excess_len = d_len - s_len;
 
 #if defined(MULADDC_HUIT)
-    for( ; i >= 8; i -= 8 )
+    for( ; s_len >= 8; s_len -= 8 )
     {
         MULADDC_INIT
         MULADDC_HUIT
         MULADDC_STOP
     }
 
-    for( ; i > 0; i-- )
+    for( ; s_len > 0; s_len-- )
     {
         MULADDC_INIT
         MULADDC_CORE
         MULADDC_STOP
     }
 #else /* MULADDC_HUIT */
-    for( ; i >= 16; i -= 16 )
+    for( ; s_len >= 16; s_len -= 16 )
     {
         MULADDC_INIT
         MULADDC_CORE   MULADDC_CORE
@@ -1430,7 +1407,7 @@ void mpi_mul_hlp( size_t i,
         MULADDC_STOP
     }
 
-    for( ; i >= 8; i -= 8 )
+    for( ; s_len >= 8; s_len -= 8 )
     {
         MULADDC_INIT
         MULADDC_CORE   MULADDC_CORE
@@ -1441,7 +1418,7 @@ void mpi_mul_hlp( size_t i,
         MULADDC_STOP
     }
 
-    for( ; i > 0; i-- )
+    for( ; s_len > 0; s_len-- )
     {
         MULADDC_INIT
         MULADDC_CORE
@@ -1449,12 +1426,12 @@ void mpi_mul_hlp( size_t i,
     }
 #endif /* MULADDC_HUIT */
 
-    t++;
-
-    while( c != 0 )
+    while( excess_len-- )
     {
         *d += c; c = ( *d < c ); d++;
     }
+
+    return( c );
 }
 
 /*
@@ -1490,8 +1467,14 @@ int mbedtls_mpi_mul_mpi( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi
     MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, i + j ) );
     MBEDTLS_MPI_CHK( mbedtls_mpi_lset( X, 0 ) );
 
-    for( ; j > 0; j-- )
-        mpi_mul_hlp( i, A->p, X->p + j - 1, B->p[j - 1] );
+    for( size_t k = 0; k < j; k++ )
+    {
+        /* We know that there cannot be any carry-out since we're
+         * iterating from bottom to top. */
+        (void) mbedtls_mpi_core_mla( X->p + k, i + 1,
+                                     A->p, i,
+                                     B->p[k] );
+    }
 
     /* If the result is 0, we don't shortcut the operation, which reduces
      * but does not eliminate side channels leaking the zero-ness. We do
@@ -1517,19 +1500,15 @@ int mbedtls_mpi_mul_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_uint
     MPI_VALIDATE_RET( X != NULL );
     MPI_VALIDATE_RET( A != NULL );
 
-    /* mpi_mul_hlp can't deal with a leading 0. */
     size_t n = A->n;
     while( n > 0 && A->p[n - 1] == 0 )
         --n;
 
-    /* The general method below doesn't work if n==0 or b==0. By chance
-     * calculating the result is trivial in those cases. */
+    /* The general method below doesn't work if b==0. */
     if( b == 0 || n == 0 )
-    {
         return( mbedtls_mpi_lset( X, 0 ) );
-    }
 
-    /* Calculate A*b as A + A*(b-1) to take advantage of mpi_mul_hlp */
+    /* Calculate A*b as A + A*(b-1) to take advantage of mbedtls_mpi_core_mla */
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     /* In general, A * b requires 1 limb more than b. If
      * A->p[n - 1] * b / b == A->p[n - 1], then A * b fits in the same
@@ -1538,10 +1517,13 @@ int mbedtls_mpi_mul_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_uint
      * making the call to grow() unconditional causes slightly fewer
      * calls to calloc() in ECP code, presumably because it reuses the
      * same mpi for a while and this way the mpi is more likely to directly
-     * grow to its final size. */
+     * grow to its final size.
+     *
+     * Note that calculating A*b as 0 + A*b doesn't work as-is because
+     * A,X can be the same. */
     MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, n + 1 ) );
     MBEDTLS_MPI_CHK( mbedtls_mpi_copy( X, A ) );
-    mpi_mul_hlp( n, A->p, X->p, b - 1 );
+    mbedtls_mpi_core_mla( X->p, X->n, A->p, n, b - 1 );
 
 cleanup:
     return( ret );
@@ -1907,8 +1889,8 @@ static void mpi_montg_init( mbedtls_mpi_uint *mm, const mbedtls_mpi *N )
  * \param           mm  The value calculated by `mpi_montg_init(&mm, N)`.
  *                      This is -N^-1 mod 2^ciL.
  * \param[in,out]   T   A bignum for temporary storage.
- *                      It must be at least twice the limb size of N plus 2
- *                      (T->n >= 2 * (N->n + 1)).
+ *                      It must be at least twice the limb size of N plus 1
+ *                      (T->n >= 2 * N->n + 1).
  *                      Its initial content is unused and
  *                      its final content is indeterminate.
  *                      Note that unlike the usual convention in the library
@@ -1917,8 +1899,8 @@ static void mpi_montg_init( mbedtls_mpi_uint *mm, const mbedtls_mpi *N )
 static void mpi_montmul( mbedtls_mpi *A, const mbedtls_mpi *B, const mbedtls_mpi *N, mbedtls_mpi_uint mm,
                          const mbedtls_mpi *T )
 {
-    size_t i, n, m;
-    mbedtls_mpi_uint u0, u1, *d;
+    size_t n, m;
+    mbedtls_mpi_uint *d;
 
     memset( T->p, 0, T->n * ciL );
 
@@ -1926,18 +1908,23 @@ static void mpi_montmul( mbedtls_mpi *A, const mbedtls_mpi *B, const mbedtls_mpi
     n = N->n;
     m = ( B->n < n ) ? B->n : n;
 
-    for( i = 0; i < n; i++ )
+    for( size_t i = 0; i < n; i++ )
     {
+        mbedtls_mpi_uint u0, u1;
+
         /*
          * T = (T + u0*B + u1*N) / 2^biL
          */
         u0 = A->p[i];
         u1 = ( d[0] + u0 * B->p[0] ) * mm;
 
-        mpi_mul_hlp( m, B->p, d, u0 );
-        mpi_mul_hlp( n, N->p, d, u1 );
-
-        d++; d[n + 1] = 0;
+        (void) mbedtls_mpi_core_mla( d, n + 2,
+                                     B->p, m,
+                                     u0 );
+        (void) mbedtls_mpi_core_mla( d, n + 2,
+                                     N->p, n,
+                                     u1 );
+        d++;
     }
 
     /* At this point, d is either the desired result or the desired result
diff --git a/library/bignum_internal.h b/library/bignum_internal.h
new file mode 100644
index 000000000000..8677dcf1f01f
--- /dev/null
+++ b/library/bignum_internal.h
@@ -0,0 +1,50 @@
+/**
+ *  Internal bignum functions
+ *
+ *  Copyright The Mbed TLS Contributors
+ *  SPDX-License-Identifier: Apache-2.0
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License"); you may
+ *  not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#ifndef MBEDTLS_BIGNUM_INTERNAL_H
+#define MBEDTLS_BIGNUM_INTERNAL_H
+
+#include "common.h"
+
+#if defined(MBEDTLS_BIGNUM_C)
+#include "mbedtls/bignum.h"
+#endif
+
+/** Perform a known-size multiply accumulate operation
+ *
+ * Add \p b * \p s to \p d.
+ *
+ * \param[in,out] d     The pointer to the (little-endian) array
+ *                      representing the bignum to accumulate onto.
+ * \param d_len         The number of limbs of \p d. This must be
+ *                      at least \p s_len.
+ * \param[in] s         The pointer to the (little-endian) array
+ *                      representing the bignum to multiply with.
+ *                      This may be the same as \p d. Otherwise,
+ *                      it must be disjoint from \p d.
+ * \param s_len         The number of limbs of \p s.
+ * \param b             A scalar to multiply with.
+ *
+ * \return c            The carry at the end of the operation.
+ */
+mbedtls_mpi_uint mbedtls_mpi_core_mla( mbedtls_mpi_uint *d, size_t d_len ,
+                                       const mbedtls_mpi_uint *s, size_t s_len,
+                                       mbedtls_mpi_uint b );
+
+#endif /* MBEDTLS_BIGNUM_INTERNAL_H */
diff --git a/library/bn_mul.h b/library/bn_mul.h
index b71ddd881a04..aa1183fa5410 100644
--- a/library/bn_mul.h
+++ b/library/bn_mul.h
@@ -99,6 +99,7 @@
 #if defined(__i386__) && defined(__OPTIMIZE__)
 
 #define MULADDC_INIT                        \
+    { mbedtls_mpi_uint t;                   \
     asm(                                    \
         "movl   %%ebx, %0           \n\t"   \
         "movl   %5, %%esi           \n\t"   \
@@ -190,7 +191,8 @@
         : "=m" (t), "=m" (c), "=m" (d), "=m" (s)        \
         : "m" (t), "m" (s), "m" (d), "m" (c), "m" (b)   \
         : "eax", "ebx", "ecx", "edx", "esi", "edi"      \
-    );
+    ); }                                                \
+
 
 #else
 
@@ -202,7 +204,7 @@
         : "=m" (t), "=m" (c), "=m" (d), "=m" (s)        \
         : "m" (t), "m" (s), "m" (d), "m" (c), "m" (b)   \
         : "eax", "ebx", "ecx", "edx", "esi", "edi"      \
-    );
+    ); }
 #endif /* SSE2 */
 #endif /* i386 */
 
diff --git a/library/ecp_curves.c b/library/ecp_curves.c
index 421a067bbfe7..6b8ff5c7fba4 100644
--- a/library/ecp_curves.c
+++ b/library/ecp_curves.c
@@ -26,6 +26,7 @@
 #include "mbedtls/error.h"
 
 #include "bn_mul.h"
+#include "bignum_internal.h"
 #include "ecp_invasive.h"
 
 #include <string.h>
@@ -5213,40 +5214,30 @@ static int ecp_mod_p521( mbedtls_mpi *N )
 
 /*
  * Fast quasi-reduction modulo p255 = 2^255 - 19
- * Write N as A0 + 2^255 A1, return A0 + 19 * A1
+ * Write N as A0 + 2^256 A1, return A0 + 38 * A1
  */
 static int ecp_mod_p255( mbedtls_mpi *N )
 {
-    int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    size_t i;
-    mbedtls_mpi M;
-    mbedtls_mpi_uint Mp[P255_WIDTH + 2];
+    mbedtls_mpi_uint Mp[P255_WIDTH];
 
-    if( N->n < P255_WIDTH )
+    /* Helper references for top part of N */
+    mbedtls_mpi_uint * const NT_p = N->p + P255_WIDTH;
+    const size_t NT_n = N->n - P255_WIDTH;
+    if( N->n <= P255_WIDTH )
         return( 0 );
-
-    /* M = A1 */
-    M.s = 1;
-    M.n = N->n - ( P255_WIDTH - 1 );
-    if( M.n > P255_WIDTH + 1 )
+    if( NT_n > P255_WIDTH )
         return( MBEDTLS_ERR_ECP_BAD_INPUT_DATA );
-    M.p = Mp;
-    memset( Mp, 0, sizeof Mp );
-    memcpy( Mp, N->p + P255_WIDTH - 1, M.n * sizeof( mbedtls_mpi_uint ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_shift_r( &M, 255 % ( 8 * sizeof( mbedtls_mpi_uint ) ) ) );
-    M.n++; /* Make room for multiplication by 19 */
 
-    /* N = A0 */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_set_bit( N, 255, 0 ) );
-    for( i = P255_WIDTH; i < N->n; i++ )
-        N->p[i] = 0;
+    /* Split N as N + 2^256 M */
+    memcpy( Mp,   NT_p, sizeof( mbedtls_mpi_uint ) * NT_n );
+    memset( NT_p, 0,    sizeof( mbedtls_mpi_uint ) * NT_n );
 
-    /* N = A0 + 19 * A1 */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_mul_int( &M, &M, 19 ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_add_abs( N, N, &M ) );
+    /* N = A0 + 38 * A1 */
+    mbedtls_mpi_core_mla( N->p, P255_WIDTH + 1,
+                          Mp, NT_n,
+                          38 );
 
-cleanup:
-    return( ret );
+    return( 0 );
 }
 #endif /* MBEDTLS_ECP_DP_CURVE25519_ENABLED */