From 66c0c0292e9017c4281ee3f2af90dce33dc82f7a Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Wed, 10 Jun 2026 08:55:02 -0400
Subject: [PATCH 1/8] Add fast path for 256/128 division

---
 include/boost/decimal/detail/u256.hpp | 110 ++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
diff --git a/include/boost/decimal/detail/u256.hpp b/include/boost/decimal/detail/u256.hpp
index aa524af15..58645df88 100644
--- a/include/boost/decimal/detail/u256.hpp
+++ b/include/boost/decimal/detail/u256.hpp
@@ -1120,6 +1120,74 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u
     return quotient;
 }
 
+#ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128
+
+// MG 3/2 fast path for u256 divided by a uint128 divisor.
+//
+// Normalizes the divisor so its MSB is set, applies the same shift to the
+// dividend (which never overflows past 256 bits for the d128 division flow
+// -- the top uint64 of `big_sig = lhs.sig * 10^34` is at most ~34 bits, and
+// the shift amount is at most ~17), runs two chained 3/2 inner divides, and
+// un-normalizes the remainder.
+//
+// Returns true when the fast path applied (`q` and `r` populated), false
+// when the divisor's high half is zero (caller should use the single-limb
+// path) or when the dividend's top 128 bits are not strictly less than the
+// normalized divisor (the 3/2 algorithm's precondition for a single-limb
+// quotient).
+BOOST_DECIMAL_FORCE_INLINE bool mg32_u256_by_u128(
+    u256 u, int128::uint128_t d,
+    int128::uint128_t& q, int128::uint128_t& r) noexcept
+{
+    if (d.high == 0U)
+    {
+        return false; // single-limb divisor: caller takes the fast 4x uint128/uint64 path
+    }
+
+    // Normalize d so its MSB is set.
+    const int shift_amount {int128::detail::countl_zero(d.high)};
+    if (shift_amount > 0)
+    {
+        d <<= shift_amount;
+        const int complement {64 - shift_amount};
+        // Shift u left by `shift_amount` across all 4 limbs. If the top limb
+        // would overflow past 256 bits, bail out; the caller will use the
+        // existing Knuth-D path.
+        if ((u.bytes[3] >> complement) != 0U)
+        {
+            return false;
+        }
+        u.bytes[3] = (u.bytes[3] << shift_amount) | (u.bytes[2] >> complement);
+        u.bytes[2] = (u.bytes[2] << shift_amount) | (u.bytes[1] >> complement);
+        u.bytes[1] = (u.bytes[1] << shift_amount) | (u.bytes[0] >> complement);
+        u.bytes[0] = (u.bytes[0] << shift_amount);
+    }
+
+    // Precondition: top 128 bits of u must be strictly less than d (the 3/2
+    // algorithm computes a one-limb quotient and breaks otherwise). When this
+    // doesn't hold the result would need a 3-limb quotient and we fall back.
+    const int128::uint128_t u_top {u.bytes[3], u.bytes[2]};
+    if (!(u_top < d))
+    {
+        return false;
+    }
+
+    const std::uint64_t v {int128::detail::impl::mg32_reciprocal_2by1(d)};
+
+    int128::uint128_t r1{};
+    const std::uint64_t q_high {int128::detail::impl::mg32_div_3by2(u_top, u.bytes[1], d, v, r1)};
+
+    int128::uint128_t r2{};
+    const std::uint64_t q_low {int128::detail::impl::mg32_div_3by2(r1, u.bytes[0], d, v, r2)};
+
+    q = int128::uint128_t{q_high, q_low};
+    // Un-normalize the remainder.
+    r = (shift_amount > 0) ? (r2 >> shift_amount) : r2;
+    return true;
+}
+
+#endif // BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128
+
 template <typename UnsignedInteger>
 BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u256& lhs, const UnsignedInteger& rhs) noexcept
 {
@@ -1132,6 +1200,25 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u
         return u256{};
     }
 
+    #ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128
+    // MG 3/2 fast path. Replaces ~3 hardware divides in the 32-bit Knuth-D
+    // outer loop with one reciprocal-compute plus two cheap 3/2 inner steps.
+    // Bails out (returns false) for inputs that don't fit the algorithm's
+    // single-limb-quotient shape; those fall through to Knuth-D below.
+    {
+        const int128::uint128_t rhs_u128 {static_cast<int128::uint128_t>(rhs)};
+        int128::uint128_t mg_q{};
+        int128::uint128_t mg_r{};
+        if (mg32_u256_by_u128(lhs, rhs_u128, mg_q, mg_r))
+        {
+            u256 result{};
+            result.bytes[0] = mg_q.low;
+            result.bytes[1] = mg_q.high;
+            return result;
+        }
+    }
+    #endif
+
     std::uint32_t u[8] {};
     std::uint32_t v[8] {};
     std::uint32_t q[8] {};
@@ -1155,6 +1242,29 @@ struct u256_divmod_result
 template <typename UnsignedInteger>
 BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto div_mod(const u256& lhs, const UnsignedInteger& rhs) noexcept -> u256_divmod_result
 {
+    #ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128
+    // MG 3/2 fast path for u256/uint128. See default_div for rationale.
+    // Single-limb divisors fall through to the 32-bit Knuth-D path below,
+    // which has a dedicated n==1 short-circuit.
+    {
+        const int128::uint128_t rhs_u128 {static_cast<int128::uint128_t>(rhs)};
+        if (rhs_u128.high != 0U)
+        {
+            int128::uint128_t mg_q{};
+            int128::uint128_t mg_r{};
+            if (mg32_u256_by_u128(lhs, rhs_u128, mg_q, mg_r))
+            {
+                u256_divmod_result out{};
+                out.quotient.bytes[0] = mg_q.low;
+                out.quotient.bytes[1] = mg_q.high;
+                out.remainder.bytes[0] = mg_r.low;
+                out.remainder.bytes[1] = mg_r.high;
+                return out;
+            }
+        }
+    }
+    #endif
+
     std::uint32_t u[8] {};
     std::uint32_t v[8] {};
     std::uint32_t q[8] {};

From 3fc7eccad5022b6b9e500cc4fda6f25dd23afc86 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Wed, 10 Jun 2026 08:55:45 -0400
Subject: [PATCH 2/8] Gate architecture based quick div

---
 .../detail/int128/detail/uint128_imp.hpp      | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/include/boost/decimal/detail/int128/detail/uint128_imp.hpp b/include/boost/decimal/detail/int128/detail/uint128_imp.hpp
index 8e1a26d57..3f430fecb 100644
--- a/include/boost/decimal/detail/int128/detail/uint128_imp.hpp
+++ b/include/boost/decimal/detail/int128/detail/uint128_imp.hpp
@@ -3028,6 +3028,30 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr uint128_t operator/(const uint
     #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_INT128) && !defined(__s390__) && !defined(__s390x__)
     else
     {
+        // On x86-64 GCC/Clang the __int128 builtin lowers to __udivti3 which
+        // runs Knuth-D (~70-90c). Route runtime calls to the Moller-Granlund
+        // 2x2 divide that backs the MSVC path (~25-30c). constexpr context
+        // still uses the builtin (inline asm is not constexpr).
+        #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128) && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION) && (defined(__GNUC__) || defined(__clang__))
+        if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(lhs))
+        {
+            // uint128/uint64 short-circuit: div_mod_intrinsic's 2x2 path
+            // forces countl_zero(divisor.high)=64 -> shift by 64, producing a
+            // correct but extra-work result. one_word_div hits a single divq.
+            if (rhs.high == 0U)
+            {
+                if (lhs.high == 0U)
+                {
+                    return {0, lhs.low / rhs.low};
+                }
+                uint128_t quotient {};
+                detail::one_word_div(lhs, rhs.low, quotient);
+                return quotient;
+            }
+            uint128_t remainder{};
+            return detail::impl::div_mod_intrinsic<false>(lhs, rhs, remainder);
+        }
+        #endif
         return static_cast<uint128_t>(static_cast<detail::builtin_u128>(lhs) / static_cast<detail::builtin_u128>(rhs));
     }
     #else
@@ -3239,6 +3263,28 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr uint128_t operator%(const uint
     #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_INT128) && !defined(__s390__) && !defined(__s390x__)
     else
     {
+        // See operator/ for the rationale on routing x86-64 GCC/Clang
+        // runtime calls through the Moller-Granlund 2x2 divide.
+        #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128) && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION) && (defined(__GNUC__) || defined(__clang__))
+        if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(lhs))
+        {
+            // uint128/uint64 short-circuit (see operator/ for the rationale).
+            if (rhs.high == 0U)
+            {
+                if (lhs.high == 0U)
+                {
+                    return {0, lhs.low % rhs.low};
+                }
+                uint128_t quotient {};
+                uint128_t remainder {};
+                detail::one_word_div(lhs, rhs.low, quotient, remainder);
+                return remainder;
+            }
+            uint128_t remainder{};
+            detail::impl::div_mod_intrinsic<true>(lhs, rhs, remainder);
+            return remainder;
+        }
+        #endif
         return static_cast<uint128_t>(static_cast<detail::builtin_u128>(lhs) % static_cast<detail::builtin_u128>(rhs));
     }
     #else

From b7b2d0254950080bf5309f5486ef78c8d6ccbdea Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Wed, 10 Jun 2026 08:56:02 -0400
Subject: [PATCH 3/8] Unify division impl

---
 include/boost/decimal/detail/div_impl.hpp | 369 +++++++++++++++++++---
 1 file changed, 327 insertions(+), 42 deletions(-)

diff --git a/include/boost/decimal/detail/div_impl.hpp b/include/boost/decimal/detail/div_impl.hpp
index cf6f70050..046210c5b 100644
--- a/include/boost/decimal/detail/div_impl.hpp
+++ b/include/boost/decimal/detail/div_impl.hpp
@@ -5,7 +5,12 @@
 #ifndef BOOST_DECIMAL_DETAIL_DIV_IMPL_HPP
 #define BOOST_DECIMAL_DETAIL_DIV_IMPL_HPP
 
-#include <boost/decimal/detail/config.hpp>
+#include <boost/decimal/detail/attributes.hpp>
+#include <boost/decimal/detail/add_impl.hpp>
+#include <boost/decimal/detail/fenv_rounding.hpp>
+#include <boost/decimal/detail/normalize.hpp>
+#include <boost/decimal/detail/power_tables.hpp>
+#include <boost/decimal/detail/components.hpp>
 #include <boost/decimal/detail/u256.hpp>
 #include "int128.hpp"
 
@@ -18,76 +23,356 @@ namespace boost {
 namespace decimal {
 namespace detail {
 
+#ifdef _MSC_VER
+#  pragma warning(push)
+#  pragma warning(disable : 4127) // Conditional expression is constant (pre-C++17 if-constexpr fallback)
+#endif
+
+namespace impl {
+
+// Default-rounding gate. The inline round-half-to-even (RHTE) in the
+// div_finalize_* helpers assumes the active rounding mode is
+// fe_dec_to_nearest. Other modes route through the constructor handoff
+// (which dispatches to fenv_round honoring the runtime mode).
+template <typename Anchor>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto div_default_rounding(const Anchor& anchor) noexcept -> bool
+{
+    static_cast<void>(anchor);
+    bool default_rounding {_boost_decimal_global_rounding_mode == rounding_mode::fe_dec_to_nearest};
+    #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+    if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(anchor))
+    {
+        default_rounding = (_boost_decimal_global_runtime_rounding_mode == rounding_mode::fe_dec_to_nearest);
+    }
+    #endif
+    return default_rounding;
+}
+
+// Division finalization with inline round-half-to-even. The driver computes
+// q = (lhs.sig * 10^p) / rhs.sig and r = (lhs.sig * 10^p) % rhs.sig where
+// both operand significands are at full precision p. The natural remainder
+// r encodes the exact sticky bit needed for correct RHTE.
+//
+// With both operands in [10^(p-1), 10^p), the quotient is in
+// [10^(p-1), 10^(p+1)), i.e. exactly p or p+1 digits.
+//
+// Case 1 (q < 10^p): q already has p digits. Round up iff
+//   2r > divisor                            (computed as r > divisor - r to
+//                                            avoid overflow when divisor is
+//                                            near the type's max)
+//   OR (2r == divisor AND q is odd).
+//
+// Case 2 (q >= 10^p): q has p+1 digits, shrink by one digit. Let
+// q' = q/10, r_q = q%10. The true fractional part is
+// f = r_q/10 + r/(10*divisor). Round up iff
+//   r_q > 5
+//   OR (r_q == 5 AND r > 0)
+//   OR (r_q == 5 AND r == 0 AND q' is odd).
+//
+// In either case the post-rounding carry q' == 10^p shifts down to 10^(p-1)
+// and bumps the exponent by one more digit.
+
+// d32/fast32 finalizer. Dividend = lhs.sig * 10^7 fits in uint64; quotient is
+// at most 10^8 - 1 which fits in uint32; remainder is bounded by the divisor
+// which fits in uint32 (rhs.sig < 10^7).
+template <typename ReturnType, typename ExpType>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto div_finalize_u64(
+    std::uint64_t q, std::uint64_t r, std::uint32_t divisor,
+    ExpType result_exp, bool sign) noexcept -> ReturnType
+{
+    constexpr auto ten_to_p {pow10(static_cast<std::uint64_t>(detail::precision_v<ReturnType>))};
+    constexpr auto ten_to_p_minus_1 {pow10(static_cast<std::uint64_t>(detail::precision_v<ReturnType> - 1))};
+
+    int extra {0};
+
+    if (q >= ten_to_p)
+    {
+        extra = 1;
+        const auto r_q {static_cast<unsigned>(q % UINT64_C(10))};
+        q /= UINT64_C(10);
+
+        const bool round_up {r_q > 5U ||
+                             (r_q == 5U && (r != UINT64_C(0) || (q & UINT64_C(1)) != UINT64_C(0)))};
+        if (round_up)
+        {
+            ++q;
+            if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p))
+            {
+                q = ten_to_p_minus_1;
+                ++extra;
+            }
+        }
+    }
+    else
+    {
+        const auto half_div {static_cast<std::uint64_t>(divisor) - r};
+        const bool round_up {r > half_div || (r == half_div && (q & UINT64_C(1)) != UINT64_C(0))};
+        if (round_up)
+        {
+            ++q;
+            if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p))
+            {
+                q = ten_to_p_minus_1;
+                ++extra;
+            }
+        }
+    }
+
+    using sig_type = typename ReturnType::significand_type;
+    return pack_in_range<ReturnType>(static_cast<sig_type>(q),
+                                     result_exp + static_cast<ExpType>(extra),
+                                     sign);
+}
+
+// d64/fast64 finalizer. Quotient and remainder both fit in uint64 because
+// quotient is at most 10^17 - 1 < 2^57 and remainder is bounded by the
+// uint64 divisor.
+template <typename ReturnType, typename ExpType>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto div_finalize_u128(
+    std::uint64_t q, std::uint64_t r, std::uint64_t divisor,
+    ExpType result_exp, bool sign) noexcept -> ReturnType
+{
+    constexpr auto ten_to_p {pow10(static_cast<std::uint64_t>(detail::precision_v<ReturnType>))};
+    constexpr auto ten_to_p_minus_1 {pow10(static_cast<std::uint64_t>(detail::precision_v<ReturnType> - 1))};
+
+    int extra {0};
+
+    if (q >= ten_to_p)
+    {
+        extra = 1;
+        const auto r_q {static_cast<unsigned>(q % UINT64_C(10))};
+        q /= UINT64_C(10);
+
+        const bool round_up {r_q > 5U ||
+                             (r_q == 5U && (r != UINT64_C(0) || (q & UINT64_C(1)) != UINT64_C(0)))};
+        if (round_up)
+        {
+            ++q;
+            if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p))
+            {
+                q = ten_to_p_minus_1;
+                ++extra;
+            }
+        }
+    }
+    else
+    {
+        const auto half_div {divisor - r};
+        const bool round_up {r > half_div || (r == half_div && (q & UINT64_C(1)) != UINT64_C(0))};
+        if (round_up)
+        {
+            ++q;
+            if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p))
+            {
+                q = ten_to_p_minus_1;
+                ++extra;
+            }
+        }
+    }
+
+    using sig_type = typename ReturnType::significand_type;
+    return pack_in_range<ReturnType>(static_cast<sig_type>(q),
+                                     result_exp + static_cast<ExpType>(extra),
+                                     sign);
+}
+
+// d128/fast128 finalizer. Quotient and remainder both fit in uint128 because
+// quotient is at most 10^35 - 1 (< 2^117) and remainder is bounded by the
+// uint128 divisor (< 10^34).
+template <typename ReturnType, typename ExpType>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto div_finalize_u256(
+    int128::uint128_t q, int128::uint128_t r, int128::uint128_t divisor,
+    ExpType result_exp, bool sign) noexcept -> ReturnType
+{
+    constexpr auto ten_to_p {pow10(int128::uint128_t{static_cast<std::uint64_t>(detail::precision_v<ReturnType>)})};
+    constexpr auto ten_to_p_minus_1 {pow10(int128::uint128_t{static_cast<std::uint64_t>(detail::precision_v<ReturnType> - 1)})};
+
+    int extra {0};
+
+    if (q >= ten_to_p)
+    {
+        extra = 1;
+        const auto dr_q {divmod10(q)};
+        const auto r_q {static_cast<unsigned>(dr_q.remainder)};
+        q = dr_q.quotient;
+
+        const bool round_up {r_q > 5U ||
+                             (r_q == 5U && (r != int128::uint128_t{0U} || (q.low & UINT64_C(1)) != UINT64_C(0)))};
+        if (round_up)
+        {
+            ++q;
+            if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p))
+            {
+                q = ten_to_p_minus_1;
+                ++extra;
+            }
+        }
+    }
+    else
+    {
+        const auto half_div {divisor - r};
+        const bool round_up {r > half_div || (r == half_div && (q.low & UINT64_C(1)) != UINT64_C(0))};
+        if (round_up)
+        {
+            ++q;
+            if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p))
+            {
+                q = ten_to_p_minus_1;
+                ++extra;
+            }
+        }
+    }
+
+    return pack_in_range<ReturnType>(q,
+                                     result_exp + static_cast<ExpType>(extra),
+                                     sign);
+}
+
+} // namespace impl
+
+// d32/fast32 division driver. Accepts a decimal type or a components struct
+// (both expose to_components()) and dispatches to the inline RHTE fast path
+// when the active rounding mode is fe_dec_to_nearest. Non-default rounding
+// modes fall back to the original wide-divide + constructor handoff so the
+// constructor's coefficient_rounding can dispatch to fenv_round.
 template <typename DecimalType, typename T>
 BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto generic_div_impl(const T& lhs, const T& rhs) noexcept -> DecimalType
 {
-    // If rhs is greater than we need to offset the significands to get the correct values
-    // e.g. 4/8 is 0 but 40/8 yields 5 in integer maths
-    //
-    // By expanding the offset to all the way to the value of numeric_limits<std::uint64_t>::digits10
-    // we can recover more of what would become the fraction to achieve better rounding
+    auto lhs_c {lhs.to_components()};
+    auto rhs_c {rhs.to_components()};
 
-    using div_type = std::uint64_t;
+    detail::expand_significand<DecimalType>(lhs_c.sig, lhs_c.exp);
+    detail::expand_significand<DecimalType>(rhs_c.sig, rhs_c.exp);
 
-    constexpr auto precision_offset {std::numeric_limits<div_type>::digits10 - precision};
-    constexpr auto ten_pow_offset {detail::pow10(static_cast<div_type>(precision_offset))};
+    const bool sign {lhs_c.sign != rhs_c.sign};
 
-    const auto big_sig_lhs {lhs.full_significand() * ten_pow_offset};
+    if (BOOST_DECIMAL_UNLIKELY(lhs_c.sig == 0U))
+    {
+        using sig_type = typename decltype(lhs_c)::significand_type;
+        return DecimalType{sig_type{0U}, lhs_c.exp - rhs_c.exp, sign};
+    }
 
-    const auto res_sig {big_sig_lhs / rhs.full_significand()};
-    const auto res_exp {(lhs.biased_exponent() - precision_offset) - rhs.biased_exponent()};
+    if (BOOST_DECIMAL_UNLIKELY(!impl::div_default_rounding(lhs_c.sig)))
+    {
+        // Non-default rounding mode: hand the wide quotient to the
+        // constructor's coefficient_rounding so fenv_round honors the
+        // runtime mode. The wider offset preserves the original behavior.
+        constexpr auto wide_offset {std::numeric_limits<std::uint64_t>::digits10 - precision_v<DecimalType>};
+        constexpr auto wide_tens {pow10(static_cast<std::uint64_t>(wide_offset))};
+        const auto wide_sig {static_cast<std::uint64_t>(lhs_c.sig) * wide_tens};
+        const auto wide_q {wide_sig / static_cast<std::uint64_t>(rhs_c.sig)};
+        const auto wide_exp {(lhs_c.exp - static_cast<int>(wide_offset)) - rhs_c.exp};
+        return DecimalType{wide_q, wide_exp, sign};
+    }
 
-    // IEEE 754-2008: division uses sign(x) XOR sign(y) for all results, including zero
-    const bool sign {lhs.isneg() != rhs.isneg()};
+    constexpr auto ten_to_p {pow10(static_cast<std::uint64_t>(precision_v<DecimalType>))};
+    const auto big_sig {static_cast<std::uint64_t>(lhs_c.sig) * ten_to_p};
+    const auto divisor {static_cast<std::uint64_t>(rhs_c.sig)};
+    const auto q {big_sig / divisor};
+    const auto r {big_sig - q * divisor};
+    const auto res_exp {(lhs_c.exp - static_cast<int>(precision_v<DecimalType>)) - rhs_c.exp};
 
-    // Let the constructor handle shrinking it back down and rounding correctly
-    return DecimalType{res_sig, res_exp, sign};
+    return impl::div_finalize_u64<DecimalType>(q, r, static_cast<std::uint32_t>(divisor), res_exp, sign);
 }
 
+// d64/fast64 division driver. Same structure as the d32 driver but uses
+// uint128 for the pre-scaled dividend (lhs.sig * 10^16 overflows uint64).
+// The quotient is at most 10^17 - 1 < 2^57 so it narrows safely to uint64
+// for the finalizer.
 template <typename DecimalType, typename T>
 BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_generic_div_impl(const T& lhs, const T& rhs, const bool sign) noexcept -> DecimalType
 {
     using unsigned_int128_type = boost::int128::uint128_t;
 
-    // If rhs is greater than we need to offset the significands to get the correct values
-    // e.g. 4/8 is 0 but 40/8 yields 5 in integer maths
-    constexpr auto offset {std::numeric_limits<unsigned_int128_type>::digits10 - detail::precision_v<decimal64_t>};
-    constexpr auto tens_needed {detail::pow10(static_cast<unsigned_int128_type>(offset))};
-    const auto big_sig_lhs {static_cast<unsigned_int128_type>(lhs.sig) * tens_needed};
+    auto lhs_c {lhs.to_components()};
+    auto rhs_c {rhs.to_components()};
 
-    const auto res_sig {big_sig_lhs / rhs.sig};
-    const auto res_exp {(lhs.exp - offset) - rhs.exp};
+    detail::expand_significand<DecimalType>(lhs_c.sig, lhs_c.exp);
+    detail::expand_significand<DecimalType>(rhs_c.sig, rhs_c.exp);
 
-    // Let the constructor handle shrinking it back down and rounding correctly
-    return DecimalType{res_sig, res_exp, sign};
+    if (BOOST_DECIMAL_UNLIKELY(lhs_c.sig == 0U))
+    {
+        using sig_type = typename decltype(lhs_c)::significand_type;
+        return DecimalType{sig_type{0U}, lhs_c.exp - rhs_c.exp, sign};
+    }
+
+    if (BOOST_DECIMAL_UNLIKELY(!impl::div_default_rounding(lhs_c.sig)))
+    {
+        constexpr auto wide_offset {std::numeric_limits<unsigned_int128_type>::digits10 - precision_v<DecimalType>};
+        const auto wide_tens {pow10(static_cast<unsigned_int128_type>(wide_offset))};
+        const auto wide_sig {static_cast<unsigned_int128_type>(lhs_c.sig) * wide_tens};
+        const auto wide_q {wide_sig / static_cast<unsigned_int128_type>(rhs_c.sig)};
+        const auto wide_exp {(lhs_c.exp - static_cast<int>(wide_offset)) - rhs_c.exp};
+        return DecimalType{wide_q, wide_exp, sign};
+    }
+
+    constexpr auto ten_to_p {static_cast<unsigned_int128_type>(pow10(static_cast<std::uint64_t>(precision_v<DecimalType>)))};
+    const auto big_sig {static_cast<unsigned_int128_type>(lhs_c.sig) * ten_to_p};
+    const auto divisor {static_cast<std::uint64_t>(rhs_c.sig)};
+    const auto q_wide {big_sig / static_cast<unsigned_int128_type>(divisor)};
+    const auto r_wide {big_sig - q_wide * static_cast<unsigned_int128_type>(divisor)};
+    const auto res_exp {(lhs_c.exp - static_cast<int>(precision_v<DecimalType>)) - rhs_c.exp};
+
+    return impl::div_finalize_u128<DecimalType>(static_cast<std::uint64_t>(q_wide.low),
+                                                static_cast<std::uint64_t>(r_wide.low),
+                                                divisor,
+                                                res_exp,
+                                                sign);
 }
 
-template <typename T>
-BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_generic_div_impl(const T& lhs, const T& rhs, T& q) noexcept -> void
+// d128/fast128 division driver. The pre-scaled dividend lhs.sig * 10^34
+// needs u256 because the product may reach ~10^68 (well above uint128 max).
+// The quotient itself fits in uint128 (<= 10^35) so we narrow before the
+// finalizer.
+template <typename DecimalType, typename T>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_generic_div_impl(const T& lhs, const T& rhs, const bool sign) noexcept -> DecimalType
 {
-    bool sign {lhs.sign != rhs.sign};
+    auto lhs_c {lhs.to_components()};
+    auto rhs_c {rhs.to_components()};
 
-    constexpr auto ten_pow_precision {pow10(int128::uint128_t(detail::precision_v<decimal128_t>))};
-    const auto big_sig_lhs {detail::umul256(lhs.sig, ten_pow_precision)};
+    detail::expand_significand<DecimalType>(lhs_c.sig, lhs_c.exp);
+    detail::expand_significand<DecimalType>(rhs_c.sig, rhs_c.exp);
 
-    auto res_sig {big_sig_lhs / rhs.sig};
-    auto res_exp {lhs.exp - rhs.exp - detail::precision_v<decimal128_t>};
+    if (BOOST_DECIMAL_UNLIKELY(lhs_c.sig == int128::uint128_t{0U}))
+    {
+        return DecimalType{int128::uint128_t{0U}, lhs_c.exp - rhs_c.exp, sign};
+    }
 
-    if (res_sig[3] != 0 || res_sig[2] != 0)
+    if (BOOST_DECIMAL_UNLIKELY(!impl::div_default_rounding(lhs_c.sig)))
     {
-        const auto sig_dig {detail::num_digits(res_sig)};
-        const auto digit_delta {sig_dig - std::numeric_limits<int128::uint128_t>::digits10};
-        res_sig /= pow10(int128::uint128_t(digit_delta));
-        res_exp += digit_delta;
+        const auto wide_tens {pow10(int128::uint128_t{static_cast<std::uint64_t>(precision_v<DecimalType>)})};
+        const auto wide_sig {detail::umul256(lhs_c.sig, wide_tens)};
+        auto wide_q {wide_sig / rhs_c.sig};
+        auto wide_exp {lhs_c.exp - rhs_c.exp - static_cast<int>(precision_v<DecimalType>)};
+
+        if (wide_q[3] != 0U || wide_q[2] != 0U)
+        {
+            const auto sig_dig {detail::num_digits(wide_q)};
+            const auto digit_delta {sig_dig - std::numeric_limits<int128::uint128_t>::digits10};
+            wide_q /= pow10(int128::uint128_t{static_cast<std::uint64_t>(digit_delta)});
+            wide_exp += digit_delta;
+        }
+
+        BOOST_DECIMAL_ASSERT((wide_q[3] | wide_q[2]) == 0U);
+        return DecimalType{int128::uint128_t{wide_q[1], wide_q[0]}, wide_exp, sign};
     }
-    // IEEE 754-2008: division uses sign(x) XOR sign(y) for all results, including zero,
-    // so we no longer override `sign` when res_sig is zero.
 
-    // Let the constructor handle shrinking it back down and rounding correctly
-    BOOST_DECIMAL_ASSERT((res_sig[3] | res_sig[2]) == 0U);
-    q = T {int128::uint128_t{res_sig[1], res_sig[0]}, res_exp, sign};
+    constexpr auto ten_to_p {pow10(int128::uint128_t{static_cast<std::uint64_t>(precision_v<DecimalType>)})};
+    const auto big_sig {detail::umul256(lhs_c.sig, ten_to_p)};
+    const auto divisor {rhs_c.sig};
+    const auto dr {impl::div_mod(big_sig, divisor)};
+
+    const int128::uint128_t q {dr.quotient.bytes[1], dr.quotient.bytes[0]};
+    const int128::uint128_t r {dr.remainder.bytes[1], dr.remainder.bytes[0]};
+    const auto res_exp {lhs_c.exp - rhs_c.exp - static_cast<int>(precision_v<DecimalType>)};
+
+    return impl::div_finalize_u256<DecimalType>(q, r, divisor, res_exp, sign);
 }
 
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#endif
+
 } // namespace detail
 } // namespace decimal
 } // namespace boost

From 1536ee42960173386131af1f5dce6dc2740f4691 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Wed, 10 Jun 2026 08:56:31 -0400
Subject: [PATCH 4/8] Add intrinsic and assembly based fast paths

---
 .../detail/int128/detail/common_div.hpp       | 289 ++++++++++++++++--
 1 file changed, 256 insertions(+), 33 deletions(-)

diff --git a/include/boost/decimal/detail/int128/detail/common_div.hpp b/include/boost/decimal/detail/int128/detail/common_div.hpp
index 263ecd02c..d5d5c828b 100644
--- a/include/boost/decimal/detail/int128/detail/common_div.hpp
+++ b/include/boost/decimal/detail/int128/detail/common_div.hpp
@@ -304,17 +304,105 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE
     return {static_cast<high_word_type>(high), low};
 }
 
+// HAS_FAST_DIV128 is true on platforms where we expose a hardware 128-by-64
+// division primitive: MSVC x64 via _udiv128, and GCC/Clang on x86-64 via
+// inline divq. Both flavors share the same Moller-Granlund-style 2x2 division
+// scaffolding below, parameterized over portable umul128/udiv128 helpers.
+//
+// AArch64 stays on the __udivti3 / Knuth-D path -- UDIV is already 7c/0.5
+// there and the inline-divq port is x86-only. CUDA device code is excluded
+// because inline asm and _udiv128 are not available in that environment.
 #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920
+#  define BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128
+#elif defined(BOOST_DECIMAL_DETAIL_INT128_HAS_INT128) \
+   && (defined(__GNUC__) || defined(__clang__))      \
+   && defined(__x86_64__)                            \
+   && !defined(__CUDA_ARCH__)
+#  define BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128
+#endif
+
+#ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128
+
+// MSVC's _addcarry_u64 takes std::uint64_t* (unsigned __int64*); GCC/Clang's
+// _addcarry_u64 takes unsigned long long*, which on LP64 Linux is a distinct
+// type from std::uint64_t (typedef'd to unsigned long there). Wrap the macros
+// with helpers that accept std::uint64_t* and convert internally.
+BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE unsigned char int128_addcarry_u64(unsigned char c, std::uint64_t x, std::uint64_t y, std::uint64_t* out) noexcept
+{
+#if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__)
+    return BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(c, x, y, out);
+#else
+    unsigned long long tmp {static_cast<unsigned long long>(*out)};
+    const unsigned char result {BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(c, x, y, &tmp)};
+    *out = static_cast<std::uint64_t>(tmp);
+    return result;
+#endif
+}
+
+BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE unsigned char int128_subborrow_u64(unsigned char c, std::uint64_t x, std::uint64_t y, std::uint64_t* out) noexcept
+{
+#if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__)
+    return BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(c, x, y, out);
+#else
+    unsigned long long tmp {static_cast<unsigned long long>(*out)};
+    const unsigned char result {BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(c, x, y, &tmp)};
+    *out = static_cast<std::uint64_t>(tmp);
+    return result;
+#endif
+}
+
+// 64x64 -> 128-bit unsigned multiply. MSVC uses the _umul128 intrinsic;
+// GCC/Clang use the native __uint128_t multiply which the compiler lowers
+// to mul/mulx on x86-64. Neither path is usable in a constant expression
+// (callers gate via IS_CONSTANT_EVALUATED), so this is not constexpr.
+BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t int128_umul128_intrinsic(std::uint64_t a, std::uint64_t b, std::uint64_t* hi) noexcept
+{
+#if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__)
+    return _umul128(a, b, hi);
+#else
+    const __uint128_t prod {static_cast<__uint128_t>(a) * b};
+    *hi = static_cast<std::uint64_t>(prod >> 64);
+    return static_cast<std::uint64_t>(prod);
+#endif
+}
+
+// 128-bit-by-64-bit unsigned divide. MSVC uses _udiv128; GCC/Clang use
+// inline divq. Caller must guarantee `high < divisor` so divq does not
+// raise #DE (the Moller-Granlund normalization in div_mod_intrinsic
+// below ensures this). Not constexpr because inline asm and _udiv128
+// are runtime-only; div_mod_intrinsic gates its call with
+// IS_CONSTANT_EVALUATED.
+BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t int128_udiv128_intrinsic(std::uint64_t high, std::uint64_t low, std::uint64_t divisor, std::uint64_t* remainder) noexcept
+{
+#if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920
+    return _udiv128(high, low, divisor, remainder);
+#else
+    std::uint64_t q;
+    std::uint64_t r;
+    __asm__ ("divq %4"
+             : "=a"(q), "=d"(r)
+             : "0"(low), "1"(high), "r"(divisor)
+             : "cc");
+    *remainder = r;
+    return q;
+#endif
+}
 
 template <bool needs_mod, typename T>
-BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T divisor, T& remainder)
+BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_intrinsic(T dividend, T divisor, T& remainder)
 {
     using high_word_type = decltype(T{}.high);
 
     // Skip normalization if divisor is already large enough
     // use direct division and intrinsic
-    // This is only possible in the unsigned case
-    BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR (!std::numeric_limits<T>::is_signed)
+    // This is only possible in the unsigned case. The check uses
+    // std::is_unsigned on the high-word type rather than
+    // std::numeric_limits<T>::is_signed to avoid forcing implicit
+    // instantiation of std::numeric_limits<T> from inside this header (the
+    // explicit specialization sits later in uint128_imp.hpp; instantiating
+    // first via numeric_limits<T> here causes clang to reject the
+    // specialization as "explicit specialization after instantiation").
+    BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR (std::is_unsigned<high_word_type>::value)
     {
         constexpr auto divisor_lower_bound{UINT64_MAX >> 1};
         if (divisor.high >= divisor_lower_bound)
@@ -324,14 +412,16 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d
             quotient.low = static_cast<std::uint64_t>(dividend.high / divisor.high);
 
             std::uint64_t product0_high{};
-            auto product0_low{_umul128(quotient.low, divisor.low, &product0_high)};
+            auto product0_low{int128_umul128_intrinsic(quotient.low, divisor.low, &product0_high)};
 
             std::uint64_t product1_high{};
-            auto product1_low{_umul128(quotient.low, static_cast<std::uint64_t>(divisor.high), &product1_high)};
+            auto product1_low{int128_umul128_intrinsic(quotient.low, static_cast<std::uint64_t>(divisor.high), &product1_high)};
 
             T product{};
             product.low = product0_low;
-            auto carry{BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(0, product0_high, product1_low, reinterpret_cast<std::uint64_t*>(&product.high))};
+            std::uint64_t product_high_tmp {static_cast<std::uint64_t>(product.high)};
+            auto carry{int128_addcarry_u64(0, product0_high, product1_low, &product_high_tmp)};
+            product.high = static_cast<high_word_type>(product_high_tmp);
             product1_high += static_cast<std::uint64_t>(carry);
 
             if (product1_high > 0 || product > dividend)
@@ -339,18 +429,22 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d
                 --quotient.low;
 
                 // Recalculate with adjusted quotient
-                product0_low = _umul128(quotient.low, divisor.low, &product0_high);
-                product1_low = _umul128(quotient.low, divisor.high, &product1_high);
+                product0_low = int128_umul128_intrinsic(quotient.low, divisor.low, &product0_high);
+                product1_low = int128_umul128_intrinsic(quotient.low, static_cast<std::uint64_t>(divisor.high), &product1_high);
 
                 product.low = product0_low;
-                carry = BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(0, product0_high, product1_low, reinterpret_cast<std::uint64_t*>(&product.high));
+                product_high_tmp = static_cast<std::uint64_t>(product.high);
+                carry = int128_addcarry_u64(0, product0_high, product1_low, &product_high_tmp);
+                product.high = static_cast<high_word_type>(product_high_tmp);
                 product1_high += static_cast<std::uint64_t>(carry);
             }
 
             BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR(needs_mod)
             {
-                auto borrow{BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(0, dividend.low, product.low, &remainder.low)};
-                BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(borrow, dividend.high, product.high, reinterpret_cast<std::uint64_t*>(&remainder.high));
+                auto borrow{int128_subborrow_u64(0, dividend.low, product.low, &remainder.low)};
+                std::uint64_t remainder_high_tmp {static_cast<std::uint64_t>(remainder.high)};
+                int128_subborrow_u64(borrow, static_cast<std::uint64_t>(dividend.high), static_cast<std::uint64_t>(product.high), &remainder_high_tmp);
+                remainder.high = static_cast<high_word_type>(remainder_high_tmp);
             }
 
             return quotient;
@@ -369,8 +463,8 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d
     quotient.high = high_digit_gte_divisor ? 1 : 0;
     std::uint64_t remainder_estimate {};
 
-    quotient.low = _udiv128(high_digit_gte_divisor ? high_digit - divisor.high : high_digit,
-                            dividend.high, divisor.high, &remainder_estimate);
+    quotient.low = int128_udiv128_intrinsic(high_digit_gte_divisor ? high_digit - divisor.high : high_digit,
+                                            dividend.high, static_cast<std::uint64_t>(divisor.high), &remainder_estimate);
 
     // Bounded correction loop with early exit
     // Typically 2 is the most number of corrections we need since this is only for 2x2 division
@@ -381,7 +475,9 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d
     while (correction_steps < max_corrections)
     {
         T product{};
-        product.low = _umul128(quotient.low, divisor.low, reinterpret_cast<std::uint64_t*>(&product.high));
+        std::uint64_t product_high_tmp {};
+        product.low = int128_umul128_intrinsic(quotient.low, divisor.low, &product_high_tmp);
+        product.high = static_cast<high_word_type>(product_high_tmp);
         if (product <= T{static_cast<high_word_type>(remainder_estimate), dividend.low})
         {
             break;
@@ -400,23 +496,27 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d
 
     // Final verification and adjustment
     std::uint64_t product0_high{};
-    auto product_low {_umul128(quotient.low, divisor.low, &product0_high)};
-    auto borrow {BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(0, dividend.low, product_low, &dividend.low)};
+    auto product_low {int128_umul128_intrinsic(quotient.low, divisor.low, &product0_high)};
+    auto borrow {int128_subborrow_u64(0, dividend.low, product_low, &dividend.low)};
 
     std::uint64_t product1_high{};
-    product_low = _umul128(quotient.low, divisor.high, &product1_high);
-    product1_high += static_cast<std::uint64_t>(BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(0, product_low, product0_high, &product_low));
+    product_low = int128_umul128_intrinsic(quotient.low, static_cast<std::uint64_t>(divisor.high), &product1_high);
+    product1_high += static_cast<std::uint64_t>(int128_addcarry_u64(0, product_low, product0_high, &product_low));
 
-    borrow = BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(borrow, static_cast<std::uint64_t>(dividend.high), product_low, reinterpret_cast<std::uint64_t*>(&dividend.high));
-    borrow = BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(borrow, high_digit, product1_high, &high_digit);
+    std::uint64_t dividend_high_tmp {static_cast<std::uint64_t>(dividend.high)};
+    borrow = int128_subborrow_u64(borrow, static_cast<std::uint64_t>(dividend.high), product_low, &dividend_high_tmp);
+    dividend.high = static_cast<high_word_type>(dividend_high_tmp);
+    borrow = int128_subborrow_u64(borrow, high_digit, product1_high, &high_digit);
     quotient.low -= static_cast<std::uint64_t>(borrow);
 
     BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR (needs_mod)
     {
         if (borrow)
         {
-            auto carry { BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(0, dividend.low, divisor.low, &dividend.low) };
-            BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(carry, static_cast<std::uint64_t>(dividend.high), static_cast<std::uint64_t>(divisor.high), reinterpret_cast<std::uint64_t*>(&dividend.high));
+            auto carry { int128_addcarry_u64(0, dividend.low, divisor.low, &dividend.low) };
+            std::uint64_t dividend_high_tmp2 {static_cast<std::uint64_t>(dividend.high)};
+            int128_addcarry_u64(carry, static_cast<std::uint64_t>(dividend.high), static_cast<std::uint64_t>(divisor.high), &dividend_high_tmp2);
+            dividend.high = static_cast<high_word_type>(dividend_high_tmp2);
         }
 
         dividend >>= shift_amount;
@@ -426,6 +526,129 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d
     return quotient;
 }
 
+// Moller-Granlund 3/2 reciprocal division for u256 by uint128.
+//
+// Computes (q, r) such that u = q*d + r, where u is 256-bit (3 limbs effective),
+// d is 128-bit (2 limbs, normalized so d.high MSB is set), and v is the
+// 64-bit reciprocal of d.
+//
+// The algorithm requires `d` to be normalized (d.high >= 2^63) and the top
+// 128 bits of `u` to be strictly less than `d` so the quotient fits in one
+// limb. The d128 division flow satisfies both: the divisor is post-
+// expand_significand, so it is in [10^33, 10^34) and shifts to fill 128 bits;
+// the dividend top 128 bits after the same shift remain well under 2^127.
+//
+// The reciprocal v is from reciprocal_2by1 below and must be precomputed.
+// References: Moller & Granlund, "Improved Division by Invariant Integers"
+// (2010), Algorithm 4.
+template <typename U128>
+BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_div_3by2(
+    U128 u_high, std::uint64_t u_low, U128 d, std::uint64_t v, U128& r) noexcept
+{
+    // q = u_high.high * v + u_high  (a 128-bit fused-multiply-add).
+    // int128_umul128_intrinsic returns the low 64 bits and writes the high
+    // 64 bits through the out pointer.
+    std::uint64_t q_high{};
+    std::uint64_t q_low {int128_umul128_intrinsic(u_high.high, v, &q_high)};
+    // Add u_high to (q_high, q_low). Carry from low into high.
+    {
+        const std::uint64_t low_sum {q_low + u_high.low};
+        const unsigned char carry_into_high {static_cast<unsigned char>(low_sum < q_low ? 1U : 0U)};
+        q_low = low_sum;
+        q_high += u_high.high + carry_into_high;
+    }
+
+    // r1 = u_high.low - q_high * d.high  (single 64-bit limb)
+    const std::uint64_t r1 {u_high.low - q_high * d.high};
+
+    // (t_high, t_low) = q_high * d.low (full 128-bit product)
+    std::uint64_t t_high{};
+    const std::uint64_t t_low {int128_umul128_intrinsic(q_high, d.low, &t_high)};
+
+    // r = (r1, u_low) - (t_high, t_low) - d
+    std::uint64_t r_low {u_low};
+    std::uint64_t r_high {r1};
+    unsigned char borrow {int128_subborrow_u64(0, r_low, t_low, &r_low)};
+    int128_subborrow_u64(borrow, r_high, t_high, &r_high);
+    borrow = int128_subborrow_u64(0, r_low, d.low, &r_low);
+    int128_subborrow_u64(borrow, r_high, d.high, &r_high);
+
+    // q' = q_high + 1
+    std::uint64_t quotient {q_high + 1U};
+
+    // First correction: if r_high >= q_low, q'-- and r += d.
+    if (r_high >= q_low)
+    {
+        --quotient;
+        const unsigned char carry {int128_addcarry_u64(0, r_low, d.low, &r_low)};
+        int128_addcarry_u64(carry, r_high, d.high, &r_high);
+    }
+
+    // Second correction (very rare): if r >= d, q'++ and r -= d.
+    const bool r_ge_d {r_high > d.high || (r_high == d.high && r_low >= d.low)};
+    if (BOOST_DECIMAL_DETAIL_INT128_UNLIKELY(r_ge_d))
+    {
+        ++quotient;
+        const unsigned char b2 {int128_subborrow_u64(0, r_low, d.low, &r_low)};
+        int128_subborrow_u64(b2, r_high, d.high, &r_high);
+    }
+
+    r.low = r_low;
+    r.high = static_cast<decltype(r.high)>(r_high);
+    return quotient;
+}
+
+// Precompute the 64-bit reciprocal for a normalized 128-bit divisor.
+// Returns v such that floor((2^192 - 1) / d) = 2^128 + v.
+// Precondition: d.high has its MSB set (d >= 2^127).
+// Reference: Moller & Granlund 2010, Algorithm 2.
+template <typename U128>
+BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_reciprocal_2by1(U128 d) noexcept
+{
+    // v_1 = floor((2^128 - 1) / d.high) - 2^64, computed via one 128/64 divide.
+    // The trick: divq(UINT64_MAX - d.high, UINT64_MAX, d.high) sidesteps the
+    // overflow trap (the divq precondition that high < divisor) because
+    // d.high >= 2^63 guarantees UINT64_MAX - d.high < d.high.
+    std::uint64_t dummy{};
+    std::uint64_t v {int128_udiv128_intrinsic(~d.high, UINT64_MAX, d.high, &dummy)};
+
+    // p = d.high * v + d.low (modular, low 64 bits only; the high part is
+    // implicit in the residue analysis below).
+    std::uint64_t p {d.high * v + d.low};
+
+    // Adjust if the addition wrapped (p < d.low).
+    if (p < d.low)
+    {
+        --v;
+        if (p >= d.high)
+        {
+            --v;
+            p -= d.high;
+        }
+        p -= d.high;
+    }
+
+    // (t_high, t_low) = v * d.low.
+    // int128_umul128_intrinsic returns the low limb and writes the high
+    // limb through the out pointer.
+    std::uint64_t t_high{};
+    const std::uint64_t t_low {int128_umul128_intrinsic(v, d.low, &t_high)};
+
+    // p += t_high (with overflow detection)
+    const std::uint64_t p_new {p + t_high};
+    if (p_new < p)
+    {
+        // Overflow; adjust.
+        --v;
+        // If (p_new, t_low) >= d, decrement again.
+        if (p_new > d.high || (p_new == d.high && t_low >= d.low))
+        {
+            --v;
+        }
+    }
+    return v;
+}
+
 #endif
 
 } // namespace impl
@@ -436,7 +659,7 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d
 template <typename T>
 BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint64_t rhs, T& quotient) noexcept
 {
-    #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION)
+    #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128) && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION)
 
     if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(lhs))
     {
@@ -444,7 +667,7 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE
 
         quotient.high = static_cast<high_word_type>(static_cast<std::uint64_t>(lhs.high) / rhs);
         auto remainder {static_cast<std::uint64_t>(lhs.high) % rhs};
-        quotient.low = _udiv128(remainder, lhs.low, rhs, &remainder);
+        quotient.low = impl::int128_udiv128_intrinsic(remainder, lhs.low, rhs, &remainder);
         return;
     }
 
@@ -472,7 +695,7 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE
 template <typename T>
 BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint64_t rhs, T& quotient, T& remainder) noexcept
 {
-    #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION)
+    #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128) && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION)
 
     if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(lhs))
     {
@@ -480,7 +703,7 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE
 
         quotient.high = static_cast<high_word_type>(static_cast<std::uint64_t>(lhs.high) / rhs);
         remainder.low = static_cast<std::uint64_t>(lhs.high) % rhs;
-        quotient.low = _udiv128(remainder.low, lhs.low, rhs, &remainder.low);
+        quotient.low = impl::int128_udiv128_intrinsic(remainder.low, lhs.low, rhs, &remainder.low);
         return;
     }
 
@@ -531,14 +754,14 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE
 {
     BOOST_DECIMAL_DETAIL_INT128_ASSUME(divisor != static_cast<T>(0));
 
-    #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920
+    #ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128
 
-    BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR(!std::numeric_limits<T>::is_signed)
+    BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR(std::is_unsigned<decltype(T{}.high)>::value)
     {
         if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(dividend))
         {
             T remainder{};
-            return impl::div_mod_msvc<false>(dividend, divisor, remainder);
+            return impl::div_mod_intrinsic<false>(dividend, divisor, remainder);
         }
     }
 
@@ -561,14 +784,14 @@ template <typename T>
 BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE constexpr T knuth_div(const T& dividend, const T& divisor, T& remainder) noexcept
 {
     BOOST_DECIMAL_DETAIL_INT128_ASSUME(divisor != static_cast<T>(0));
-    
-    #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920
 
-    BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR(!std::numeric_limits<T>::is_signed)
+    #ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128
+
+    BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR(std::is_unsigned<decltype(T{}.high)>::value)
     {
         if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(dividend))
         {
-            return impl::div_mod_msvc<true>(dividend, divisor, remainder);
+            return impl::div_mod_intrinsic<true>(dividend, divisor, remainder);
         }
     }
 

From 337eb77d53ced85a9b42e409ed00211d827b9e56 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Wed, 10 Jun 2026 08:56:38 -0400
Subject: [PATCH 5/8] Update impls

---
 include/boost/decimal/decimal128_t.hpp      | 28 ++++++-----------
 include/boost/decimal/decimal_fast128_t.hpp | 33 +++++----------------
 include/boost/decimal/decimal_fast64_t.hpp  | 20 +------------
 3 files changed, 17 insertions(+), 64 deletions(-)

diff --git a/include/boost/decimal/decimal128_t.hpp b/include/boost/decimal/decimal128_t.hpp
index adc3ec3fa..0579b11aa 100644
--- a/include/boost/decimal/decimal128_t.hpp
+++ b/include/boost/decimal/decimal128_t.hpp
@@ -1606,14 +1606,14 @@ std::ostream& operator<<( std::ostream& os, boost::decimal::detail::builtin_uint
 
 BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_div_impl(const decimal128_t& lhs, const decimal128_t& rhs, decimal128_t& q, decimal128_t& r) noexcept -> void
 {
+    const bool sign {lhs.isneg() != rhs.isneg()};
+
     #ifndef BOOST_DECIMAL_FAST_MATH
     // Check pre-conditions
     constexpr decimal128_t zero {0, 0};
     constexpr decimal128_t nan {from_bits(detail::d128_nan_mask)};
     constexpr decimal128_t inf {from_bits(detail::d128_inf_mask)};
 
-    const bool sign {lhs.isneg() != rhs.isneg()};
-
     const auto lhs_fp {fpclassify(lhs)};
     const auto rhs_fp {fpclassify(rhs)};
 
@@ -1711,11 +1711,7 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_div_impl(const decimal128_t& lhs, const d
               << "\nexp rhs: " << exp_rhs << std::endl;
     #endif
 
-    detail::decimal128_t_components q_components {};
-
-    detail::d128_generic_div_impl(lhs_components, rhs.to_components(), q_components);
-
-    q = decimal128_t(q_components.sig, q_components.exp, q_components.sign);
+    q = detail::d128_generic_div_impl<decimal128_t>(lhs_components, rhs.to_components(), sign);
 }
 
 #ifdef _MSC_VER
@@ -2010,13 +2006,13 @@ template <typename Integer>
 BOOST_DECIMAL_CUDA_CONSTEXPR auto operator/(const decimal128_t lhs, const Integer rhs) noexcept
     BOOST_DECIMAL_REQUIRES_RETURN(detail::is_integral_v, Integer, decimal128_t)
 {
+    const bool sign {lhs.isneg() != (rhs < 0)};
+
     #ifndef BOOST_DECIMAL_FAST_MATH
     // Check pre-conditions
     constexpr decimal128_t zero {0, 0};
     constexpr decimal128_t inf {from_bits(detail::d128_inf_mask)};
 
-    const bool sign {lhs.isneg() != (rhs < 0)};
-
     const auto lhs_fp {fpclassify(lhs)};
 
     switch (lhs_fp)
@@ -2041,24 +2037,21 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator/(const decimal128_t lhs, const Intege
     detail::expand_significand<decimal128_t>(lhs_components.sig, lhs_components.exp);
 
     const detail::decimal128_t_components rhs_components {detail::make_positive_unsigned(rhs), 0, rhs < 0};
-    detail::decimal128_t_components q_components {};
 
-    detail::d128_generic_div_impl(lhs_components, rhs_components, q_components);
-
-    return decimal128_t(q_components.sig, q_components.exp, q_components.sign);
+    return detail::d128_generic_div_impl<decimal128_t>(lhs_components, rhs_components, sign);
 }
 
 template <typename Integer>
 BOOST_DECIMAL_CUDA_CONSTEXPR auto operator/(const Integer lhs, const decimal128_t rhs) noexcept
     BOOST_DECIMAL_REQUIRES_RETURN(detail::is_integral_v, Integer, decimal128_t)
 {
+    const bool sign {(lhs < 0) != rhs.isneg()};
+
     #ifndef BOOST_DECIMAL_FAST_MATH
     // Check pre-conditions
     constexpr decimal128_t zero {0, 0};
     constexpr decimal128_t inf {from_bits(detail::d128_inf_mask)};
 
-    const bool sign {(lhs < 0) != rhs.isneg()};
-
     const auto rhs_fp {fpclassify(rhs)};
 
     switch (rhs_fp)
@@ -2078,11 +2071,8 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator/(const Integer lhs, const decimal128_
     detail::expand_significand<decimal128_t>(rhs_components.sig, rhs_components.exp);
 
     const detail::decimal128_t_components lhs_components {detail::make_positive_unsigned(lhs), 0, lhs < 0};
-    detail::decimal128_t_components q_components {};
-
-    detail::d128_generic_div_impl(lhs_components, rhs_components, q_components);
 
-    return decimal128_t(q_components.sig, q_components.exp, q_components.sign);
+    return detail::d128_generic_div_impl<decimal128_t>(lhs_components, rhs_components, sign);
 }
 
 BOOST_DECIMAL_CUDA_CONSTEXPR auto operator%(const decimal128_t& lhs, const decimal128_t& rhs) noexcept -> decimal128_t
diff --git a/include/boost/decimal/decimal_fast128_t.hpp b/include/boost/decimal/decimal_fast128_t.hpp
index 052282b18..109ec686a 100644
--- a/include/boost/decimal/decimal_fast128_t.hpp
+++ b/include/boost/decimal/decimal_fast128_t.hpp
@@ -1399,20 +1399,7 @@ constexpr auto d128f_div_impl(const decimal_fast128_t& lhs, const decimal_fast12
     static_cast<void>(r);
     #endif
 
-    #ifdef BOOST_DECIMAL_DEBUG
-    std::cerr << "sig lhs: " << sig_lhs
-              << "\nexp lhs: " << exp_lhs
-              << "\nsig rhs: " << sig_rhs
-              << "\nexp rhs: " << exp_rhs << std::endl;
-    #endif
-
-    constexpr auto ten_pow_precision {detail::pow10(int128::uint128_t(detail::precision_v<decimal_fast128_t>))};
-    const auto big_sig_lhs {detail::umul256(lhs.significand_, ten_pow_precision)};
-
-    const auto res_sig {big_sig_lhs / rhs.significand_};
-    const auto res_exp {lhs.biased_exponent() - rhs.biased_exponent() - detail::precision_v<decimal_fast128_t>};
-
-    q = decimal_fast128_t(static_cast<int128::uint128_t>(res_sig), res_exp, sign);
+    q = detail::d128_generic_div_impl<decimal_fast128_t>(lhs.to_components(), rhs.to_components(), sign);
 }
 
 constexpr auto operator/(const decimal_fast128_t& lhs, const decimal_fast128_t& rhs) noexcept -> decimal_fast128_t
@@ -1428,13 +1415,13 @@ template <typename Integer>
 constexpr auto operator/(const decimal_fast128_t& lhs, const Integer rhs) noexcept
     BOOST_DECIMAL_REQUIRES_RETURN(detail::is_integral_v, Integer, decimal_fast128_t)
 {
+    const bool sign {lhs.isneg() != (rhs < 0)};
+
     #ifndef BOOST_DECIMAL_FAST_MATH
     // Check pre-conditions
     constexpr decimal_fast128_t zero {0, 0};
     constexpr decimal_fast128_t inf {direct_init_d128(detail::d128_fast_inf, 0, false)};
 
-    const bool sign {lhs.isneg() != (rhs < 0)};
-
     const auto lhs_fp {fpclassify(lhs)};
 
     switch (lhs_fp)
@@ -1459,24 +1446,21 @@ constexpr auto operator/(const decimal_fast128_t& lhs, const Integer rhs) noexce
 
     const auto rhs_sig {detail::make_positive_unsigned(rhs)};
     const detail::decimal_fast128_t_components rhs_components {rhs_sig, 0, rhs < 0};
-    detail::decimal_fast128_t_components q_components {};
-
-    detail::d128_generic_div_impl(lhs_components, rhs_components, q_components);
 
-    return {q_components.sig, q_components.exp, q_components.sign};
+    return detail::d128_generic_div_impl<decimal_fast128_t>(lhs_components, rhs_components, sign);
 }
 
 template <typename Integer>
 constexpr auto operator/(const Integer lhs, const decimal_fast128_t& rhs) noexcept
     BOOST_DECIMAL_REQUIRES_RETURN(detail::is_integral_v, Integer, decimal_fast128_t)
 {
+    const bool sign {(lhs < 0) != rhs.isneg()};
+
     #ifndef BOOST_DECIMAL_FAST_MATH
     // Check pre-conditions
     constexpr decimal_fast128_t zero {0, 0};
     constexpr decimal_fast128_t inf {direct_init_d128(detail::d128_fast_inf, 0, false)};
 
-    const bool sign {(lhs < 0) != rhs.isneg()};
-
     const auto rhs_fp {fpclassify(rhs)};
 
     switch (rhs_fp)
@@ -1494,11 +1478,8 @@ constexpr auto operator/(const Integer lhs, const decimal_fast128_t& rhs) noexce
 
     const detail::decimal_fast128_t_components lhs_components {detail::make_positive_unsigned(lhs), 0, lhs < 0};
     const detail::decimal_fast128_t_components rhs_components {rhs.significand_, rhs.biased_exponent(), rhs.isneg()};
-    detail::decimal_fast128_t_components q_components {};
-
-    detail::d128_generic_div_impl(lhs_components, rhs_components, q_components);
 
-    return {q_components.sig, q_components.exp, q_components.sign};
+    return detail::d128_generic_div_impl<decimal_fast128_t>(lhs_components, rhs_components, sign);
 }
 
 constexpr auto operator%(const decimal_fast128_t& lhs, const decimal_fast128_t& rhs) noexcept -> decimal_fast128_t
diff --git a/include/boost/decimal/decimal_fast64_t.hpp b/include/boost/decimal/decimal_fast64_t.hpp
index 6f4046098..07deba724 100644
--- a/include/boost/decimal/decimal_fast64_t.hpp
+++ b/include/boost/decimal/decimal_fast64_t.hpp
@@ -1521,25 +1521,7 @@ constexpr auto d64_fast_div_impl(const decimal_fast64_t& lhs, const decimal_fast
     static_cast<void>(r);
     #endif
 
-    #ifdef BOOST_DECIMAL_DEBUG
-    std::cerr << "sig lhs: " << sig_lhs
-              << "\nexp lhs: " << exp_lhs
-              << "\nsig rhs: " << sig_rhs
-              << "\nexp rhs: " << exp_rhs << std::endl;
-    #endif
-
-    using unsigned_int128_type = boost::int128::uint128_t;
-
-    // If rhs is greater than we need to offset the significands to get the correct values
-    // e.g. 4/8 is 0 but 40/8 yields 5 in integer maths
-    constexpr auto offset {std::numeric_limits<unsigned_int128_type>::digits10 - detail::precision_v<decimal_fast64_t>};
-    constexpr auto tens_needed {detail::pow10(static_cast<unsigned_int128_type>(offset))};
-    const auto big_sig_lhs {static_cast<unsigned_int128_type>(lhs.significand_) * tens_needed};
-
-    const auto res_sig {big_sig_lhs / rhs.significand_};
-    const auto res_exp {(lhs.biased_exponent() - offset) - rhs.biased_exponent()};
-
-    q = decimal_fast64_t{res_sig, res_exp, sign};
+    q = detail::d64_generic_div_impl<decimal_fast64_t>(lhs.to_components(), rhs.to_components(), sign);
 }
 
 constexpr auto operator/(const decimal_fast64_t& lhs, const decimal_fast64_t& rhs) noexcept -> decimal_fast64_t

From 5bcaa5d77a3eb5f3407dcf66e7a599632d0f18b2 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Wed, 17 Jun 2026 13:13:47 -0400
Subject: [PATCH 6/8] Mark function as constexpr

---
 include/boost/decimal/detail/u256.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/boost/decimal/detail/u256.hpp b/include/boost/decimal/detail/u256.hpp
index 58645df88..5959ae03e 100644
--- a/include/boost/decimal/detail/u256.hpp
+++ b/include/boost/decimal/detail/u256.hpp
@@ -1135,7 +1135,7 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u
 // path) or when the dividend's top 128 bits are not strictly less than the
 // normalized divisor (the 3/2 algorithm's precondition for a single-limb
 // quotient).
-BOOST_DECIMAL_FORCE_INLINE bool mg32_u256_by_u128(
+constexpr BOOST_DECIMAL_FORCE_INLINE bool mg32_u256_by_u128(
     u256 u, int128::uint128_t d,
     int128::uint128_t& q, int128::uint128_t& r) noexcept
 {

From b9e731ee243c6c8064a374fd648648681b7879b5 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Wed, 17 Jun 2026 15:48:52 -0400
Subject: [PATCH 7/8] Mark function as constexpr

---
 include/boost/decimal/detail/int128/detail/common_div.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/boost/decimal/detail/int128/detail/common_div.hpp b/include/boost/decimal/detail/int128/detail/common_div.hpp
index d5d5c828b..361dd8dc4 100644
--- a/include/boost/decimal/detail/int128/detail/common_div.hpp
+++ b/include/boost/decimal/detail/int128/detail/common_div.hpp
@@ -603,7 +603,7 @@ BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_div_3by2(
 // Precondition: d.high has its MSB set (d >= 2^127).
 // Reference: Moller & Granlund 2010, Algorithm 2.
 template <typename U128>
-BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_reciprocal_2by1(U128 d) noexcept
+constexpr BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_reciprocal_2by1(U128 d) noexcept
 {
     // v_1 = floor((2^128 - 1) / d.high) - 2^64, computed via one 128/64 divide.
     // The trick: divq(UINT64_MAX - d.high, UINT64_MAX, d.high) sidesteps the

From ac5c2acd2d5e165b55cc66fea3a9cae6b175fc33 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Thu, 18 Jun 2026 08:34:39 -0400
Subject: [PATCH 8/8] More x64 fixes

---
 .../detail/int128/detail/common_div.hpp       |  2 +-
 include/boost/decimal/detail/u256.hpp         | 29 +++++++++++++++++--
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/include/boost/decimal/detail/int128/detail/common_div.hpp b/include/boost/decimal/detail/int128/detail/common_div.hpp
index 361dd8dc4..d5d5c828b 100644
--- a/include/boost/decimal/detail/int128/detail/common_div.hpp
+++ b/include/boost/decimal/detail/int128/detail/common_div.hpp
@@ -603,7 +603,7 @@ BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_div_3by2(
 // Precondition: d.high has its MSB set (d >= 2^127).
 // Reference: Moller & Granlund 2010, Algorithm 2.
 template <typename U128>
-constexpr BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_reciprocal_2by1(U128 d) noexcept
+BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_reciprocal_2by1(U128 d) noexcept
 {
     // v_1 = floor((2^128 - 1) / d.high) - 2^64, computed via one 128/64 divide.
     // The trick: divq(UINT64_MAX - d.high, UINT64_MAX, d.high) sidesteps the
diff --git a/include/boost/decimal/detail/u256.hpp b/include/boost/decimal/detail/u256.hpp
index 5959ae03e..cffe1aa5d 100644
--- a/include/boost/decimal/detail/u256.hpp
+++ b/include/boost/decimal/detail/u256.hpp
@@ -1135,7 +1135,7 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u
 // path) or when the dividend's top 128 bits are not strictly less than the
 // normalized divisor (the 3/2 algorithm's precondition for a single-limb
 // quotient).
-constexpr BOOST_DECIMAL_FORCE_INLINE bool mg32_u256_by_u128(
+BOOST_DECIMAL_FORCE_INLINE bool mg32_u256_by_u128(
     u256 u, int128::uint128_t d,
     int128::uint128_t& q, int128::uint128_t& r) noexcept
 {
@@ -1188,6 +1188,22 @@ constexpr BOOST_DECIMAL_FORCE_INLINE bool mg32_u256_by_u128(
 
 #endif // BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128
 
+// True when the divisor fits in 128 bits, i.e. the MG u256-by-u128 fast path
+// is applicable. A u256 divisor only qualifies when its upper 128 bits are
+// clear; static_cast<uint128_t> would otherwise truncate it silently and the
+// fast path would divide by the wrong value. Narrower integer types always
+// qualify.
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR bool divisor_fits_u128(const u256& rhs) noexcept
+{
+    return rhs.bytes[2] == UINT64_C(0) && rhs.bytes[3] == UINT64_C(0);
+}
+
+template <typename UnsignedInteger>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR bool divisor_fits_u128(const UnsignedInteger&) noexcept
+{
+    return true;
+}
+
 template <typename UnsignedInteger>
 BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u256& lhs, const UnsignedInteger& rhs) noexcept
 {
@@ -1200,11 +1216,15 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u
         return u256{};
     }
 
-    #ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128
+    #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128) && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION)
     // MG 3/2 fast path. Replaces ~3 hardware divides in the 32-bit Knuth-D
     // outer loop with one reciprocal-compute plus two cheap 3/2 inner steps.
     // Bails out (returns false) for inputs that don't fit the algorithm's
     // single-limb-quotient shape; those fall through to Knuth-D below.
+    // Runtime-only: the MG helpers call hardware intrinsics that are not
+    // usable in a constant expression, so this path is gated behind
+    // IS_CONSTANT_EVALUATED (and compiled out when detection is unavailable).
+    if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(lhs) && divisor_fits_u128(rhs))
     {
         const int128::uint128_t rhs_u128 {static_cast<int128::uint128_t>(rhs)};
         int128::uint128_t mg_q{};
@@ -1242,10 +1262,13 @@ struct u256_divmod_result
 template <typename UnsignedInteger>
 BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto div_mod(const u256& lhs, const UnsignedInteger& rhs) noexcept -> u256_divmod_result
 {
-    #ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128
+    #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128) && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION)
     // MG 3/2 fast path for u256/uint128. See default_div for rationale.
     // Single-limb divisors fall through to the 32-bit Knuth-D path below,
     // which has a dedicated n==1 short-circuit.
+    // Runtime-only: gated behind IS_CONSTANT_EVALUATED because the MG helpers
+    // call hardware intrinsics that are not usable in a constant expression.
+    if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(lhs) && divisor_fits_u128(rhs))
     {
         const int128::uint128_t rhs_u128 {static_cast<int128::uint128_t>(rhs)};
         if (rhs_u128.high != 0U)