From 66c0c0292e9017c4281ee3f2af90dce33dc82f7a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 10 Jun 2026 08:55:02 -0400 Subject: [PATCH 1/8] Add fast path for 256/128 division --- include/boost/decimal/detail/u256.hpp | 110 ++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/include/boost/decimal/detail/u256.hpp b/include/boost/decimal/detail/u256.hpp index aa524af15..58645df88 100644 --- a/include/boost/decimal/detail/u256.hpp +++ b/include/boost/decimal/detail/u256.hpp @@ -1120,6 +1120,74 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u return quotient; } +#ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128 + +// MG 3/2 fast path for u256 divided by a uint128 divisor. +// +// Normalizes the divisor so its MSB is set, applies the same shift to the +// dividend (which never overflows past 256 bits for the d128 division flow +// -- the top uint64 of `big_sig = lhs.sig * 10^34` is at most ~34 bits, and +// the shift amount is at most ~17), runs two chained 3/2 inner divides, and +// un-normalizes the remainder. +// +// Returns true when the fast path applied (`q` and `r` populated), false +// when the divisor's high half is zero (caller should use the single-limb +// path) or when the dividend's top 128 bits are not strictly less than the +// normalized divisor (the 3/2 algorithm's precondition for a single-limb +// quotient). +BOOST_DECIMAL_FORCE_INLINE bool mg32_u256_by_u128( + u256 u, int128::uint128_t d, + int128::uint128_t& q, int128::uint128_t& r) noexcept +{ + if (d.high == 0U) + { + return false; // single-limb divisor: caller takes the fast 4x uint128/uint64 path + } + + // Normalize d so its MSB is set. + const int shift_amount {int128::detail::countl_zero(d.high)}; + if (shift_amount > 0) + { + d <<= shift_amount; + const int complement {64 - shift_amount}; + // Shift u left by `shift_amount` across all 4 limbs. If the top limb + // would overflow past 256 bits, bail out; the caller will use the + // existing Knuth-D path. + if ((u.bytes[3] >> complement) != 0U) + { + return false; + } + u.bytes[3] = (u.bytes[3] << shift_amount) | (u.bytes[2] >> complement); + u.bytes[2] = (u.bytes[2] << shift_amount) | (u.bytes[1] >> complement); + u.bytes[1] = (u.bytes[1] << shift_amount) | (u.bytes[0] >> complement); + u.bytes[0] = (u.bytes[0] << shift_amount); + } + + // Precondition: top 128 bits of u must be strictly less than d (the 3/2 + // algorithm computes a one-limb quotient and breaks otherwise). When this + // doesn't hold the result would need a 3-limb quotient and we fall back. + const int128::uint128_t u_top {u.bytes[3], u.bytes[2]}; + if (!(u_top < d)) + { + return false; + } + + const std::uint64_t v {int128::detail::impl::mg32_reciprocal_2by1(d)}; + + int128::uint128_t r1{}; + const std::uint64_t q_high {int128::detail::impl::mg32_div_3by2(u_top, u.bytes[1], d, v, r1)}; + + int128::uint128_t r2{}; + const std::uint64_t q_low {int128::detail::impl::mg32_div_3by2(r1, u.bytes[0], d, v, r2)}; + + q = int128::uint128_t{q_high, q_low}; + // Un-normalize the remainder. + r = (shift_amount > 0) ? (r2 >> shift_amount) : r2; + return true; +} + +#endif // BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128 + template BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u256& lhs, const UnsignedInteger& rhs) noexcept { @@ -1132,6 +1200,25 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u return u256{}; } + #ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128 + // MG 3/2 fast path. Replaces ~3 hardware divides in the 32-bit Knuth-D + // outer loop with one reciprocal-compute plus two cheap 3/2 inner steps. + // Bails out (returns false) for inputs that don't fit the algorithm's + // single-limb-quotient shape; those fall through to Knuth-D below. + { + const int128::uint128_t rhs_u128 {static_cast(rhs)}; + int128::uint128_t mg_q{}; + int128::uint128_t mg_r{}; + if (mg32_u256_by_u128(lhs, rhs_u128, mg_q, mg_r)) + { + u256 result{}; + result.bytes[0] = mg_q.low; + result.bytes[1] = mg_q.high; + return result; + } + } + #endif + std::uint32_t u[8] {}; std::uint32_t v[8] {}; std::uint32_t q[8] {}; @@ -1155,6 +1242,29 @@ struct u256_divmod_result template BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto div_mod(const u256& lhs, const UnsignedInteger& rhs) noexcept -> u256_divmod_result { + #ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128 + // MG 3/2 fast path for u256/uint128. See default_div for rationale. + // Single-limb divisors fall through to the 32-bit Knuth-D path below, + // which has a dedicated n==1 short-circuit. + { + const int128::uint128_t rhs_u128 {static_cast(rhs)}; + if (rhs_u128.high != 0U) + { + int128::uint128_t mg_q{}; + int128::uint128_t mg_r{}; + if (mg32_u256_by_u128(lhs, rhs_u128, mg_q, mg_r)) + { + u256_divmod_result out{}; + out.quotient.bytes[0] = mg_q.low; + out.quotient.bytes[1] = mg_q.high; + out.remainder.bytes[0] = mg_r.low; + out.remainder.bytes[1] = mg_r.high; + return out; + } + } + } + #endif + std::uint32_t u[8] {}; std::uint32_t v[8] {}; std::uint32_t q[8] {}; From 3fc7eccad5022b6b9e500cc4fda6f25dd23afc86 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 10 Jun 2026 08:55:45 -0400 Subject: [PATCH 2/8] Gate architecture based quick div --- .../detail/int128/detail/uint128_imp.hpp | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/include/boost/decimal/detail/int128/detail/uint128_imp.hpp b/include/boost/decimal/detail/int128/detail/uint128_imp.hpp index 8e1a26d57..3f430fecb 100644 --- a/include/boost/decimal/detail/int128/detail/uint128_imp.hpp +++ b/include/boost/decimal/detail/int128/detail/uint128_imp.hpp @@ -3028,6 +3028,30 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr uint128_t operator/(const uint #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_INT128) && !defined(__s390__) && !defined(__s390x__) else { + // On x86-64 GCC/Clang the __int128 builtin lowers to __udivti3 which + // runs Knuth-D (~70-90c). Route runtime calls to the Moller-Granlund + // 2x2 divide that backs the MSVC path (~25-30c). constexpr context + // still uses the builtin (inline asm is not constexpr). + #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128) && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION) && (defined(__GNUC__) || defined(__clang__)) + if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(lhs)) + { + // uint128/uint64 short-circuit: div_mod_intrinsic's 2x2 path + // forces countl_zero(divisor.high)=64 -> shift by 64, producing a + // correct but extra-work result. one_word_div hits a single divq. + if (rhs.high == 0U) + { + if (lhs.high == 0U) + { + return {0, lhs.low / rhs.low}; + } + uint128_t quotient {}; + detail::one_word_div(lhs, rhs.low, quotient); + return quotient; + } + uint128_t remainder{}; + return detail::impl::div_mod_intrinsic(lhs, rhs, remainder); + } + #endif return static_cast(static_cast(lhs) / static_cast(rhs)); } #else @@ -3239,6 +3263,28 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr uint128_t operator%(const uint #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_INT128) && !defined(__s390__) && !defined(__s390x__) else { + // See operator/ for the rationale on routing x86-64 GCC/Clang + // runtime calls through the Moller-Granlund 2x2 divide. + #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128) && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION) && (defined(__GNUC__) || defined(__clang__)) + if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(lhs)) + { + // uint128/uint64 short-circuit (see operator/ for the rationale). + if (rhs.high == 0U) + { + if (lhs.high == 0U) + { + return {0, lhs.low % rhs.low}; + } + uint128_t quotient {}; + uint128_t remainder {}; + detail::one_word_div(lhs, rhs.low, quotient, remainder); + return remainder; + } + uint128_t remainder{}; + detail::impl::div_mod_intrinsic(lhs, rhs, remainder); + return remainder; + } + #endif return static_cast(static_cast(lhs) % static_cast(rhs)); } #else From b7b2d0254950080bf5309f5486ef78c8d6ccbdea Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 10 Jun 2026 08:56:02 -0400 Subject: [PATCH 3/8] Unify division impl --- include/boost/decimal/detail/div_impl.hpp | 369 +++++++++++++++++++--- 1 file changed, 327 insertions(+), 42 deletions(-) diff --git a/include/boost/decimal/detail/div_impl.hpp b/include/boost/decimal/detail/div_impl.hpp index cf6f70050..046210c5b 100644 --- a/include/boost/decimal/detail/div_impl.hpp +++ b/include/boost/decimal/detail/div_impl.hpp @@ -5,7 +5,12 @@ #ifndef BOOST_DECIMAL_DETAIL_DIV_IMPL_HPP #define BOOST_DECIMAL_DETAIL_DIV_IMPL_HPP -#include +#include +#include +#include +#include +#include +#include #include #include "int128.hpp" @@ -18,76 +23,356 @@ namespace boost { namespace decimal { namespace detail { +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4127) // Conditional expression is constant (pre-C++17 if-constexpr fallback) +#endif + +namespace impl { + +// Default-rounding gate. The inline round-half-to-even (RHTE) in the +// div_finalize_* helpers assumes the active rounding mode is +// fe_dec_to_nearest. Other modes route through the constructor handoff +// (which dispatches to fenv_round honoring the runtime mode). +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto div_default_rounding(const Anchor& anchor) noexcept -> bool +{ + static_cast(anchor); + bool default_rounding {_boost_decimal_global_rounding_mode == rounding_mode::fe_dec_to_nearest}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(anchor)) + { + default_rounding = (_boost_decimal_global_runtime_rounding_mode == rounding_mode::fe_dec_to_nearest); + } + #endif + return default_rounding; +} + +// Division finalization with inline round-half-to-even. The driver computes +// q = (lhs.sig * 10^p) / rhs.sig and r = (lhs.sig * 10^p) % rhs.sig where +// both operand significands are at full precision p. The natural remainder +// r encodes the exact sticky bit needed for correct RHTE. +// +// With both operands in [10^(p-1), 10^p), the quotient is in +// [10^(p-1), 10^(p+1)), i.e. exactly p or p+1 digits. +// +// Case 1 (q < 10^p): q already has p digits. Round up iff +// 2r > divisor (computed as r > divisor - r to +// avoid overflow when divisor is +// near the type's max) +// OR (2r == divisor AND q is odd). +// +// Case 2 (q >= 10^p): q has p+1 digits, shrink by one digit. Let +// q' = q/10, r_q = q%10. The true fractional part is +// f = r_q/10 + r/(10*divisor). Round up iff +// r_q > 5 +// OR (r_q == 5 AND r > 0) +// OR (r_q == 5 AND r == 0 AND q' is odd). +// +// In either case the post-rounding carry q' == 10^p shifts down to 10^(p-1) +// and bumps the exponent by one more digit. + +// d32/fast32 finalizer. Dividend = lhs.sig * 10^7 fits in uint64; quotient is +// at most 10^8 - 1 which fits in uint32; remainder is bounded by the divisor +// which fits in uint32 (rhs.sig < 10^7). +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto div_finalize_u64( + std::uint64_t q, std::uint64_t r, std::uint32_t divisor, + ExpType result_exp, bool sign) noexcept -> ReturnType +{ + constexpr auto ten_to_p {pow10(static_cast(detail::precision_v))}; + constexpr auto ten_to_p_minus_1 {pow10(static_cast(detail::precision_v - 1))}; + + int extra {0}; + + if (q >= ten_to_p) + { + extra = 1; + const auto r_q {static_cast(q % UINT64_C(10))}; + q /= UINT64_C(10); + + const bool round_up {r_q > 5U || + (r_q == 5U && (r != UINT64_C(0) || (q & UINT64_C(1)) != UINT64_C(0)))}; + if (round_up) + { + ++q; + if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p)) + { + q = ten_to_p_minus_1; + ++extra; + } + } + } + else + { + const auto half_div {static_cast(divisor) - r}; + const bool round_up {r > half_div || (r == half_div && (q & UINT64_C(1)) != UINT64_C(0))}; + if (round_up) + { + ++q; + if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p)) + { + q = ten_to_p_minus_1; + ++extra; + } + } + } + + using sig_type = typename ReturnType::significand_type; + return pack_in_range(static_cast(q), + result_exp + static_cast(extra), + sign); +} + +// d64/fast64 finalizer. Quotient and remainder both fit in uint64 because +// quotient is at most 10^17 - 1 < 2^57 and remainder is bounded by the +// uint64 divisor. +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto div_finalize_u128( + std::uint64_t q, std::uint64_t r, std::uint64_t divisor, + ExpType result_exp, bool sign) noexcept -> ReturnType +{ + constexpr auto ten_to_p {pow10(static_cast(detail::precision_v))}; + constexpr auto ten_to_p_minus_1 {pow10(static_cast(detail::precision_v - 1))}; + + int extra {0}; + + if (q >= ten_to_p) + { + extra = 1; + const auto r_q {static_cast(q % UINT64_C(10))}; + q /= UINT64_C(10); + + const bool round_up {r_q > 5U || + (r_q == 5U && (r != UINT64_C(0) || (q & UINT64_C(1)) != UINT64_C(0)))}; + if (round_up) + { + ++q; + if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p)) + { + q = ten_to_p_minus_1; + ++extra; + } + } + } + else + { + const auto half_div {divisor - r}; + const bool round_up {r > half_div || (r == half_div && (q & UINT64_C(1)) != UINT64_C(0))}; + if (round_up) + { + ++q; + if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p)) + { + q = ten_to_p_minus_1; + ++extra; + } + } + } + + using sig_type = typename ReturnType::significand_type; + return pack_in_range(static_cast(q), + result_exp + static_cast(extra), + sign); +} + +// d128/fast128 finalizer. Quotient and remainder both fit in uint128 because +// quotient is at most 10^35 - 1 (< 2^117) and remainder is bounded by the +// uint128 divisor (< 10^34). +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto div_finalize_u256( + int128::uint128_t q, int128::uint128_t r, int128::uint128_t divisor, + ExpType result_exp, bool sign) noexcept -> ReturnType +{ + constexpr auto ten_to_p {pow10(int128::uint128_t{static_cast(detail::precision_v)})}; + constexpr auto ten_to_p_minus_1 {pow10(int128::uint128_t{static_cast(detail::precision_v - 1)})}; + + int extra {0}; + + if (q >= ten_to_p) + { + extra = 1; + const auto dr_q {divmod10(q)}; + const auto r_q {static_cast(dr_q.remainder)}; + q = dr_q.quotient; + + const bool round_up {r_q > 5U || + (r_q == 5U && (r != int128::uint128_t{0U} || (q.low & UINT64_C(1)) != UINT64_C(0)))}; + if (round_up) + { + ++q; + if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p)) + { + q = ten_to_p_minus_1; + ++extra; + } + } + } + else + { + const auto half_div {divisor - r}; + const bool round_up {r > half_div || (r == half_div && (q.low & UINT64_C(1)) != UINT64_C(0))}; + if (round_up) + { + ++q; + if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p)) + { + q = ten_to_p_minus_1; + ++extra; + } + } + } + + return pack_in_range(q, + result_exp + static_cast(extra), + sign); +} + +} // namespace impl + +// d32/fast32 division driver. Accepts a decimal type or a components struct +// (both expose to_components()) and dispatches to the inline RHTE fast path +// when the active rounding mode is fe_dec_to_nearest. Non-default rounding +// modes fall back to the original wide-divide + constructor handoff so the +// constructor's coefficient_rounding can dispatch to fenv_round. template BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto generic_div_impl(const T& lhs, const T& rhs) noexcept -> DecimalType { - // If rhs is greater than we need to offset the significands to get the correct values - // e.g. 4/8 is 0 but 40/8 yields 5 in integer maths - // - // By expanding the offset to all the way to the value of numeric_limits::digits10 - // we can recover more of what would become the fraction to achieve better rounding + auto lhs_c {lhs.to_components()}; + auto rhs_c {rhs.to_components()}; - using div_type = std::uint64_t; + detail::expand_significand(lhs_c.sig, lhs_c.exp); + detail::expand_significand(rhs_c.sig, rhs_c.exp); - constexpr auto precision_offset {std::numeric_limits::digits10 - precision}; - constexpr auto ten_pow_offset {detail::pow10(static_cast(precision_offset))}; + const bool sign {lhs_c.sign != rhs_c.sign}; - const auto big_sig_lhs {lhs.full_significand() * ten_pow_offset}; + if (BOOST_DECIMAL_UNLIKELY(lhs_c.sig == 0U)) + { + using sig_type = typename decltype(lhs_c)::significand_type; + return DecimalType{sig_type{0U}, lhs_c.exp - rhs_c.exp, sign}; + } - const auto res_sig {big_sig_lhs / rhs.full_significand()}; - const auto res_exp {(lhs.biased_exponent() - precision_offset) - rhs.biased_exponent()}; + if (BOOST_DECIMAL_UNLIKELY(!impl::div_default_rounding(lhs_c.sig))) + { + // Non-default rounding mode: hand the wide quotient to the + // constructor's coefficient_rounding so fenv_round honors the + // runtime mode. The wider offset preserves the original behavior. + constexpr auto wide_offset {std::numeric_limits::digits10 - precision_v}; + constexpr auto wide_tens {pow10(static_cast(wide_offset))}; + const auto wide_sig {static_cast(lhs_c.sig) * wide_tens}; + const auto wide_q {wide_sig / static_cast(rhs_c.sig)}; + const auto wide_exp {(lhs_c.exp - static_cast(wide_offset)) - rhs_c.exp}; + return DecimalType{wide_q, wide_exp, sign}; + } - // IEEE 754-2008: division uses sign(x) XOR sign(y) for all results, including zero - const bool sign {lhs.isneg() != rhs.isneg()}; + constexpr auto ten_to_p {pow10(static_cast(precision_v))}; + const auto big_sig {static_cast(lhs_c.sig) * ten_to_p}; + const auto divisor {static_cast(rhs_c.sig)}; + const auto q {big_sig / divisor}; + const auto r {big_sig - q * divisor}; + const auto res_exp {(lhs_c.exp - static_cast(precision_v)) - rhs_c.exp}; - // Let the constructor handle shrinking it back down and rounding correctly - return DecimalType{res_sig, res_exp, sign}; + return impl::div_finalize_u64(q, r, static_cast(divisor), res_exp, sign); } +// d64/fast64 division driver. Same structure as the d32 driver but uses +// uint128 for the pre-scaled dividend (lhs.sig * 10^16 overflows uint64). +// The quotient is at most 10^17 - 1 < 2^57 so it narrows safely to uint64 +// for the finalizer. template BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_generic_div_impl(const T& lhs, const T& rhs, const bool sign) noexcept -> DecimalType { using unsigned_int128_type = boost::int128::uint128_t; - // If rhs is greater than we need to offset the significands to get the correct values - // e.g. 4/8 is 0 but 40/8 yields 5 in integer maths - constexpr auto offset {std::numeric_limits::digits10 - detail::precision_v}; - constexpr auto tens_needed {detail::pow10(static_cast(offset))}; - const auto big_sig_lhs {static_cast(lhs.sig) * tens_needed}; + auto lhs_c {lhs.to_components()}; + auto rhs_c {rhs.to_components()}; - const auto res_sig {big_sig_lhs / rhs.sig}; - const auto res_exp {(lhs.exp - offset) - rhs.exp}; + detail::expand_significand(lhs_c.sig, lhs_c.exp); + detail::expand_significand(rhs_c.sig, rhs_c.exp); - // Let the constructor handle shrinking it back down and rounding correctly - return DecimalType{res_sig, res_exp, sign}; + if (BOOST_DECIMAL_UNLIKELY(lhs_c.sig == 0U)) + { + using sig_type = typename decltype(lhs_c)::significand_type; + return DecimalType{sig_type{0U}, lhs_c.exp - rhs_c.exp, sign}; + } + + if (BOOST_DECIMAL_UNLIKELY(!impl::div_default_rounding(lhs_c.sig))) + { + constexpr auto wide_offset {std::numeric_limits::digits10 - precision_v}; + const auto wide_tens {pow10(static_cast(wide_offset))}; + const auto wide_sig {static_cast(lhs_c.sig) * wide_tens}; + const auto wide_q {wide_sig / static_cast(rhs_c.sig)}; + const auto wide_exp {(lhs_c.exp - static_cast(wide_offset)) - rhs_c.exp}; + return DecimalType{wide_q, wide_exp, sign}; + } + + constexpr auto ten_to_p {static_cast(pow10(static_cast(precision_v)))}; + const auto big_sig {static_cast(lhs_c.sig) * ten_to_p}; + const auto divisor {static_cast(rhs_c.sig)}; + const auto q_wide {big_sig / static_cast(divisor)}; + const auto r_wide {big_sig - q_wide * static_cast(divisor)}; + const auto res_exp {(lhs_c.exp - static_cast(precision_v)) - rhs_c.exp}; + + return impl::div_finalize_u128(static_cast(q_wide.low), + static_cast(r_wide.low), + divisor, + res_exp, + sign); } -template -BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_generic_div_impl(const T& lhs, const T& rhs, T& q) noexcept -> void +// d128/fast128 division driver. The pre-scaled dividend lhs.sig * 10^34 +// needs u256 because the product may reach ~10^68 (well above uint128 max). +// The quotient itself fits in uint128 (<= 10^35) so we narrow before the +// finalizer. +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_generic_div_impl(const T& lhs, const T& rhs, const bool sign) noexcept -> DecimalType { - bool sign {lhs.sign != rhs.sign}; + auto lhs_c {lhs.to_components()}; + auto rhs_c {rhs.to_components()}; - constexpr auto ten_pow_precision {pow10(int128::uint128_t(detail::precision_v))}; - const auto big_sig_lhs {detail::umul256(lhs.sig, ten_pow_precision)}; + detail::expand_significand(lhs_c.sig, lhs_c.exp); + detail::expand_significand(rhs_c.sig, rhs_c.exp); - auto res_sig {big_sig_lhs / rhs.sig}; - auto res_exp {lhs.exp - rhs.exp - detail::precision_v}; + if (BOOST_DECIMAL_UNLIKELY(lhs_c.sig == int128::uint128_t{0U})) + { + return DecimalType{int128::uint128_t{0U}, lhs_c.exp - rhs_c.exp, sign}; + } - if (res_sig[3] != 0 || res_sig[2] != 0) + if (BOOST_DECIMAL_UNLIKELY(!impl::div_default_rounding(lhs_c.sig))) { - const auto sig_dig {detail::num_digits(res_sig)}; - const auto digit_delta {sig_dig - std::numeric_limits::digits10}; - res_sig /= pow10(int128::uint128_t(digit_delta)); - res_exp += digit_delta; + const auto wide_tens {pow10(int128::uint128_t{static_cast(precision_v)})}; + const auto wide_sig {detail::umul256(lhs_c.sig, wide_tens)}; + auto wide_q {wide_sig / rhs_c.sig}; + auto wide_exp {lhs_c.exp - rhs_c.exp - static_cast(precision_v)}; + + if (wide_q[3] != 0U || wide_q[2] != 0U) + { + const auto sig_dig {detail::num_digits(wide_q)}; + const auto digit_delta {sig_dig - std::numeric_limits::digits10}; + wide_q /= pow10(int128::uint128_t{static_cast(digit_delta)}); + wide_exp += digit_delta; + } + + BOOST_DECIMAL_ASSERT((wide_q[3] | wide_q[2]) == 0U); + return DecimalType{int128::uint128_t{wide_q[1], wide_q[0]}, wide_exp, sign}; } - // IEEE 754-2008: division uses sign(x) XOR sign(y) for all results, including zero, - // so we no longer override `sign` when res_sig is zero. - // Let the constructor handle shrinking it back down and rounding correctly - BOOST_DECIMAL_ASSERT((res_sig[3] | res_sig[2]) == 0U); - q = T {int128::uint128_t{res_sig[1], res_sig[0]}, res_exp, sign}; + constexpr auto ten_to_p {pow10(int128::uint128_t{static_cast(precision_v)})}; + const auto big_sig {detail::umul256(lhs_c.sig, ten_to_p)}; + const auto divisor {rhs_c.sig}; + const auto dr {impl::div_mod(big_sig, divisor)}; + + const int128::uint128_t q {dr.quotient.bytes[1], dr.quotient.bytes[0]}; + const int128::uint128_t r {dr.remainder.bytes[1], dr.remainder.bytes[0]}; + const auto res_exp {lhs_c.exp - rhs_c.exp - static_cast(precision_v)}; + + return impl::div_finalize_u256(q, r, divisor, res_exp, sign); } +#ifdef _MSC_VER +# pragma warning(pop) +#endif + } // namespace detail } // namespace decimal } // namespace boost From 1536ee42960173386131af1f5dce6dc2740f4691 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 10 Jun 2026 08:56:31 -0400 Subject: [PATCH 4/8] Add intrinsic and assembly based fast paths --- .../detail/int128/detail/common_div.hpp | 289 ++++++++++++++++-- 1 file changed, 256 insertions(+), 33 deletions(-) diff --git a/include/boost/decimal/detail/int128/detail/common_div.hpp b/include/boost/decimal/detail/int128/detail/common_div.hpp index 263ecd02c..d5d5c828b 100644 --- a/include/boost/decimal/detail/int128/detail/common_div.hpp +++ b/include/boost/decimal/detail/int128/detail/common_div.hpp @@ -304,17 +304,105 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE return {static_cast(high), low}; } +// HAS_FAST_DIV128 is true on platforms where we expose a hardware 128-by-64 +// division primitive: MSVC x64 via _udiv128, and GCC/Clang on x86-64 via +// inline divq. Both flavors share the same Moller-Granlund-style 2x2 division +// scaffolding below, parameterized over portable umul128/udiv128 helpers. +// +// AArch64 stays on the __udivti3 / Knuth-D path -- UDIV is already 7c/0.5 +// there and the inline-divq port is x86-only. CUDA device code is excluded +// because inline asm and _udiv128 are not available in that environment. #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 +# define BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128 +#elif defined(BOOST_DECIMAL_DETAIL_INT128_HAS_INT128) \ + && (defined(__GNUC__) || defined(__clang__)) \ + && defined(__x86_64__) \ + && !defined(__CUDA_ARCH__) +# define BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128 +#endif + +#ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128 + +// MSVC's _addcarry_u64 takes std::uint64_t* (unsigned __int64*); GCC/Clang's +// _addcarry_u64 takes unsigned long long*, which on LP64 Linux is a distinct +// type from std::uint64_t (typedef'd to unsigned long there). Wrap the macros +// with helpers that accept std::uint64_t* and convert internally. +BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE unsigned char int128_addcarry_u64(unsigned char c, std::uint64_t x, std::uint64_t y, std::uint64_t* out) noexcept +{ +#if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) + return BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(c, x, y, out); +#else + unsigned long long tmp {static_cast(*out)}; + const unsigned char result {BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(c, x, y, &tmp)}; + *out = static_cast(tmp); + return result; +#endif +} + +BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE unsigned char int128_subborrow_u64(unsigned char c, std::uint64_t x, std::uint64_t y, std::uint64_t* out) noexcept +{ +#if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) + return BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(c, x, y, out); +#else + unsigned long long tmp {static_cast(*out)}; + const unsigned char result {BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(c, x, y, &tmp)}; + *out = static_cast(tmp); + return result; +#endif +} + +// 64x64 -> 128-bit unsigned multiply. MSVC uses the _umul128 intrinsic; +// GCC/Clang use the native __uint128_t multiply which the compiler lowers +// to mul/mulx on x86-64. Neither path is usable in a constant expression +// (callers gate via IS_CONSTANT_EVALUATED), so this is not constexpr. +BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t int128_umul128_intrinsic(std::uint64_t a, std::uint64_t b, std::uint64_t* hi) noexcept +{ +#if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) + return _umul128(a, b, hi); +#else + const __uint128_t prod {static_cast<__uint128_t>(a) * b}; + *hi = static_cast(prod >> 64); + return static_cast(prod); +#endif +} + +// 128-bit-by-64-bit unsigned divide. MSVC uses _udiv128; GCC/Clang use +// inline divq. Caller must guarantee `high < divisor` so divq does not +// raise #DE (the Moller-Granlund normalization in div_mod_intrinsic +// below ensures this). Not constexpr because inline asm and _udiv128 +// are runtime-only; div_mod_intrinsic gates its call with +// IS_CONSTANT_EVALUATED. +BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t int128_udiv128_intrinsic(std::uint64_t high, std::uint64_t low, std::uint64_t divisor, std::uint64_t* remainder) noexcept +{ +#if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 + return _udiv128(high, low, divisor, remainder); +#else + std::uint64_t q; + std::uint64_t r; + __asm__ ("divq %4" + : "=a"(q), "=d"(r) + : "0"(low), "1"(high), "r"(divisor) + : "cc"); + *remainder = r; + return q; +#endif +} template -BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T divisor, T& remainder) +BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_intrinsic(T dividend, T divisor, T& remainder) { using high_word_type = decltype(T{}.high); // Skip normalization if divisor is already large enough // use direct division and intrinsic - // This is only possible in the unsigned case - BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR (!std::numeric_limits::is_signed) + // This is only possible in the unsigned case. The check uses + // std::is_unsigned on the high-word type rather than + // std::numeric_limits::is_signed to avoid forcing implicit + // instantiation of std::numeric_limits from inside this header (the + // explicit specialization sits later in uint128_imp.hpp; instantiating + // first via numeric_limits here causes clang to reject the + // specialization as "explicit specialization after instantiation"). + BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR (std::is_unsigned::value) { constexpr auto divisor_lower_bound{UINT64_MAX >> 1}; if (divisor.high >= divisor_lower_bound) @@ -324,14 +412,16 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d quotient.low = static_cast(dividend.high / divisor.high); std::uint64_t product0_high{}; - auto product0_low{_umul128(quotient.low, divisor.low, &product0_high)}; + auto product0_low{int128_umul128_intrinsic(quotient.low, divisor.low, &product0_high)}; std::uint64_t product1_high{}; - auto product1_low{_umul128(quotient.low, static_cast(divisor.high), &product1_high)}; + auto product1_low{int128_umul128_intrinsic(quotient.low, static_cast(divisor.high), &product1_high)}; T product{}; product.low = product0_low; - auto carry{BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(0, product0_high, product1_low, reinterpret_cast(&product.high))}; + std::uint64_t product_high_tmp {static_cast(product.high)}; + auto carry{int128_addcarry_u64(0, product0_high, product1_low, &product_high_tmp)}; + product.high = static_cast(product_high_tmp); product1_high += static_cast(carry); if (product1_high > 0 || product > dividend) @@ -339,18 +429,22 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d --quotient.low; // Recalculate with adjusted quotient - product0_low = _umul128(quotient.low, divisor.low, &product0_high); - product1_low = _umul128(quotient.low, divisor.high, &product1_high); + product0_low = int128_umul128_intrinsic(quotient.low, divisor.low, &product0_high); + product1_low = int128_umul128_intrinsic(quotient.low, static_cast(divisor.high), &product1_high); product.low = product0_low; - carry = BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(0, product0_high, product1_low, reinterpret_cast(&product.high)); + product_high_tmp = static_cast(product.high); + carry = int128_addcarry_u64(0, product0_high, product1_low, &product_high_tmp); + product.high = static_cast(product_high_tmp); product1_high += static_cast(carry); } BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR(needs_mod) { - auto borrow{BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(0, dividend.low, product.low, &remainder.low)}; - BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(borrow, dividend.high, product.high, reinterpret_cast(&remainder.high)); + auto borrow{int128_subborrow_u64(0, dividend.low, product.low, &remainder.low)}; + std::uint64_t remainder_high_tmp {static_cast(remainder.high)}; + int128_subborrow_u64(borrow, static_cast(dividend.high), static_cast(product.high), &remainder_high_tmp); + remainder.high = static_cast(remainder_high_tmp); } return quotient; @@ -369,8 +463,8 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d quotient.high = high_digit_gte_divisor ? 1 : 0; std::uint64_t remainder_estimate {}; - quotient.low = _udiv128(high_digit_gte_divisor ? high_digit - divisor.high : high_digit, - dividend.high, divisor.high, &remainder_estimate); + quotient.low = int128_udiv128_intrinsic(high_digit_gte_divisor ? high_digit - divisor.high : high_digit, + dividend.high, static_cast(divisor.high), &remainder_estimate); // Bounded correction loop with early exit // Typically 2 is the most number of corrections we need since this is only for 2x2 division @@ -381,7 +475,9 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d while (correction_steps < max_corrections) { T product{}; - product.low = _umul128(quotient.low, divisor.low, reinterpret_cast(&product.high)); + std::uint64_t product_high_tmp {}; + product.low = int128_umul128_intrinsic(quotient.low, divisor.low, &product_high_tmp); + product.high = static_cast(product_high_tmp); if (product <= T{static_cast(remainder_estimate), dividend.low}) { break; @@ -400,23 +496,27 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d // Final verification and adjustment std::uint64_t product0_high{}; - auto product_low {_umul128(quotient.low, divisor.low, &product0_high)}; - auto borrow {BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(0, dividend.low, product_low, ÷nd.low)}; + auto product_low {int128_umul128_intrinsic(quotient.low, divisor.low, &product0_high)}; + auto borrow {int128_subborrow_u64(0, dividend.low, product_low, ÷nd.low)}; std::uint64_t product1_high{}; - product_low = _umul128(quotient.low, divisor.high, &product1_high); - product1_high += static_cast(BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(0, product_low, product0_high, &product_low)); + product_low = int128_umul128_intrinsic(quotient.low, static_cast(divisor.high), &product1_high); + product1_high += static_cast(int128_addcarry_u64(0, product_low, product0_high, &product_low)); - borrow = BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(borrow, static_cast(dividend.high), product_low, reinterpret_cast(÷nd.high)); - borrow = BOOST_DECIMAL_DETAIL_INT128_SUB_BORROW(borrow, high_digit, product1_high, &high_digit); + std::uint64_t dividend_high_tmp {static_cast(dividend.high)}; + borrow = int128_subborrow_u64(borrow, static_cast(dividend.high), product_low, ÷nd_high_tmp); + dividend.high = static_cast(dividend_high_tmp); + borrow = int128_subborrow_u64(borrow, high_digit, product1_high, &high_digit); quotient.low -= static_cast(borrow); BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR (needs_mod) { if (borrow) { - auto carry { BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(0, dividend.low, divisor.low, ÷nd.low) }; - BOOST_DECIMAL_DETAIL_INT128_ADD_CARRY(carry, static_cast(dividend.high), static_cast(divisor.high), reinterpret_cast(÷nd.high)); + auto carry { int128_addcarry_u64(0, dividend.low, divisor.low, ÷nd.low) }; + std::uint64_t dividend_high_tmp2 {static_cast(dividend.high)}; + int128_addcarry_u64(carry, static_cast(dividend.high), static_cast(divisor.high), ÷nd_high_tmp2); + dividend.high = static_cast(dividend_high_tmp2); } dividend >>= shift_amount; @@ -426,6 +526,129 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d return quotient; } +// Moller-Granlund 3/2 reciprocal division for u256 by uint128. +// +// Computes (q, r) such that u = q*d + r, where u is 256-bit (3 limbs effective), +// d is 128-bit (2 limbs, normalized so d.high MSB is set), and v is the +// 64-bit reciprocal of d. +// +// The algorithm requires `d` to be normalized (d.high >= 2^63) and the top +// 128 bits of `u` to be strictly less than `d` so the quotient fits in one +// limb. The d128 division flow satisfies both: the divisor is post- +// expand_significand, so it is in [10^33, 10^34) and shifts to fill 128 bits; +// the dividend top 128 bits after the same shift remain well under 2^127. +// +// The reciprocal v is from reciprocal_2by1 below and must be precomputed. +// References: Moller & Granlund, "Improved Division by Invariant Integers" +// (2010), Algorithm 4. +template +BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_div_3by2( + U128 u_high, std::uint64_t u_low, U128 d, std::uint64_t v, U128& r) noexcept +{ + // q = u_high.high * v + u_high (a 128-bit fused-multiply-add). + // int128_umul128_intrinsic returns the low 64 bits and writes the high + // 64 bits through the out pointer. + std::uint64_t q_high{}; + std::uint64_t q_low {int128_umul128_intrinsic(u_high.high, v, &q_high)}; + // Add u_high to (q_high, q_low). Carry from low into high. + { + const std::uint64_t low_sum {q_low + u_high.low}; + const unsigned char carry_into_high {static_cast(low_sum < q_low ? 1U : 0U)}; + q_low = low_sum; + q_high += u_high.high + carry_into_high; + } + + // r1 = u_high.low - q_high * d.high (single 64-bit limb) + const std::uint64_t r1 {u_high.low - q_high * d.high}; + + // (t_high, t_low) = q_high * d.low (full 128-bit product) + std::uint64_t t_high{}; + const std::uint64_t t_low {int128_umul128_intrinsic(q_high, d.low, &t_high)}; + + // r = (r1, u_low) - (t_high, t_low) - d + std::uint64_t r_low {u_low}; + std::uint64_t r_high {r1}; + unsigned char borrow {int128_subborrow_u64(0, r_low, t_low, &r_low)}; + int128_subborrow_u64(borrow, r_high, t_high, &r_high); + borrow = int128_subborrow_u64(0, r_low, d.low, &r_low); + int128_subborrow_u64(borrow, r_high, d.high, &r_high); + + // q' = q_high + 1 + std::uint64_t quotient {q_high + 1U}; + + // First correction: if r_high >= q_low, q'-- and r += d. + if (r_high >= q_low) + { + --quotient; + const unsigned char carry {int128_addcarry_u64(0, r_low, d.low, &r_low)}; + int128_addcarry_u64(carry, r_high, d.high, &r_high); + } + + // Second correction (very rare): if r >= d, q'++ and r -= d. + const bool r_ge_d {r_high > d.high || (r_high == d.high && r_low >= d.low)}; + if (BOOST_DECIMAL_DETAIL_INT128_UNLIKELY(r_ge_d)) + { + ++quotient; + const unsigned char b2 {int128_subborrow_u64(0, r_low, d.low, &r_low)}; + int128_subborrow_u64(b2, r_high, d.high, &r_high); + } + + r.low = r_low; + r.high = static_cast(r_high); + return quotient; +} + +// Precompute the 64-bit reciprocal for a normalized 128-bit divisor. +// Returns v such that floor((2^192 - 1) / d) = 2^128 + v. +// Precondition: d.high has its MSB set (d >= 2^127). +// Reference: Moller & Granlund 2010, Algorithm 2. +template +BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_reciprocal_2by1(U128 d) noexcept +{ + // v_1 = floor((2^128 - 1) / d.high) - 2^64, computed via one 128/64 divide. + // The trick: divq(UINT64_MAX - d.high, UINT64_MAX, d.high) sidesteps the + // overflow trap (the divq precondition that high < divisor) because + // d.high >= 2^63 guarantees UINT64_MAX - d.high < d.high. + std::uint64_t dummy{}; + std::uint64_t v {int128_udiv128_intrinsic(~d.high, UINT64_MAX, d.high, &dummy)}; + + // p = d.high * v + d.low (modular, low 64 bits only; the high part is + // implicit in the residue analysis below). + std::uint64_t p {d.high * v + d.low}; + + // Adjust if the addition wrapped (p < d.low). + if (p < d.low) + { + --v; + if (p >= d.high) + { + --v; + p -= d.high; + } + p -= d.high; + } + + // (t_high, t_low) = v * d.low. + // int128_umul128_intrinsic returns the low limb and writes the high + // limb through the out pointer. + std::uint64_t t_high{}; + const std::uint64_t t_low {int128_umul128_intrinsic(v, d.low, &t_high)}; + + // p += t_high (with overflow detection) + const std::uint64_t p_new {p + t_high}; + if (p_new < p) + { + // Overflow; adjust. + --v; + // If (p_new, t_low) >= d, decrement again. + if (p_new > d.high || (p_new == d.high && t_low >= d.low)) + { + --v; + } + } + return v; +} + #endif } // namespace impl @@ -436,7 +659,7 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T d template BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint64_t rhs, T& quotient) noexcept { - #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION) + #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128) && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION) if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(lhs)) { @@ -444,7 +667,7 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE quotient.high = static_cast(static_cast(lhs.high) / rhs); auto remainder {static_cast(lhs.high) % rhs}; - quotient.low = _udiv128(remainder, lhs.low, rhs, &remainder); + quotient.low = impl::int128_udiv128_intrinsic(remainder, lhs.low, rhs, &remainder); return; } @@ -472,7 +695,7 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE template BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint64_t rhs, T& quotient, T& remainder) noexcept { - #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION) + #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128) && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION) if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(lhs)) { @@ -480,7 +703,7 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE quotient.high = static_cast(static_cast(lhs.high) / rhs); remainder.low = static_cast(lhs.high) % rhs; - quotient.low = _udiv128(remainder.low, lhs.low, rhs, &remainder.low); + quotient.low = impl::int128_udiv128_intrinsic(remainder.low, lhs.low, rhs, &remainder.low); return; } @@ -531,14 +754,14 @@ BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE { BOOST_DECIMAL_DETAIL_INT128_ASSUME(divisor != static_cast(0)); - #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 + #ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128 - BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR(!std::numeric_limits::is_signed) + BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR(std::is_unsigned::value) { if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(dividend)) { T remainder{}; - return impl::div_mod_msvc(dividend, divisor, remainder); + return impl::div_mod_intrinsic(dividend, divisor, remainder); } } @@ -561,14 +784,14 @@ template BOOST_DECIMAL_DETAIL_INT128_HOST_DEVICE BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE constexpr T knuth_div(const T& dividend, const T& divisor, T& remainder) noexcept { BOOST_DECIMAL_DETAIL_INT128_ASSUME(divisor != static_cast(0)); - - #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 - BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR(!std::numeric_limits::is_signed) + #ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128 + + BOOST_DECIMAL_DETAIL_INT128_IF_CONSTEXPR(std::is_unsigned::value) { if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(dividend)) { - return impl::div_mod_msvc(dividend, divisor, remainder); + return impl::div_mod_intrinsic(dividend, divisor, remainder); } } From 337eb77d53ced85a9b42e409ed00211d827b9e56 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 10 Jun 2026 08:56:38 -0400 Subject: [PATCH 5/8] Update impls --- include/boost/decimal/decimal128_t.hpp | 28 ++++++----------- include/boost/decimal/decimal_fast128_t.hpp | 33 +++++---------------- include/boost/decimal/decimal_fast64_t.hpp | 20 +------------ 3 files changed, 17 insertions(+), 64 deletions(-) diff --git a/include/boost/decimal/decimal128_t.hpp b/include/boost/decimal/decimal128_t.hpp index adc3ec3fa..0579b11aa 100644 --- a/include/boost/decimal/decimal128_t.hpp +++ b/include/boost/decimal/decimal128_t.hpp @@ -1606,14 +1606,14 @@ std::ostream& operator<<( std::ostream& os, boost::decimal::detail::builtin_uint BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_div_impl(const decimal128_t& lhs, const decimal128_t& rhs, decimal128_t& q, decimal128_t& r) noexcept -> void { + const bool sign {lhs.isneg() != rhs.isneg()}; + #ifndef BOOST_DECIMAL_FAST_MATH // Check pre-conditions constexpr decimal128_t zero {0, 0}; constexpr decimal128_t nan {from_bits(detail::d128_nan_mask)}; constexpr decimal128_t inf {from_bits(detail::d128_inf_mask)}; - const bool sign {lhs.isneg() != rhs.isneg()}; - const auto lhs_fp {fpclassify(lhs)}; const auto rhs_fp {fpclassify(rhs)}; @@ -1711,11 +1711,7 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_div_impl(const decimal128_t& lhs, const d << "\nexp rhs: " << exp_rhs << std::endl; #endif - detail::decimal128_t_components q_components {}; - - detail::d128_generic_div_impl(lhs_components, rhs.to_components(), q_components); - - q = decimal128_t(q_components.sig, q_components.exp, q_components.sign); + q = detail::d128_generic_div_impl(lhs_components, rhs.to_components(), sign); } #ifdef _MSC_VER @@ -2010,13 +2006,13 @@ template BOOST_DECIMAL_CUDA_CONSTEXPR auto operator/(const decimal128_t lhs, const Integer rhs) noexcept BOOST_DECIMAL_REQUIRES_RETURN(detail::is_integral_v, Integer, decimal128_t) { + const bool sign {lhs.isneg() != (rhs < 0)}; + #ifndef BOOST_DECIMAL_FAST_MATH // Check pre-conditions constexpr decimal128_t zero {0, 0}; constexpr decimal128_t inf {from_bits(detail::d128_inf_mask)}; - const bool sign {lhs.isneg() != (rhs < 0)}; - const auto lhs_fp {fpclassify(lhs)}; switch (lhs_fp) @@ -2041,24 +2037,21 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator/(const decimal128_t lhs, const Intege detail::expand_significand(lhs_components.sig, lhs_components.exp); const detail::decimal128_t_components rhs_components {detail::make_positive_unsigned(rhs), 0, rhs < 0}; - detail::decimal128_t_components q_components {}; - detail::d128_generic_div_impl(lhs_components, rhs_components, q_components); - - return decimal128_t(q_components.sig, q_components.exp, q_components.sign); + return detail::d128_generic_div_impl(lhs_components, rhs_components, sign); } template BOOST_DECIMAL_CUDA_CONSTEXPR auto operator/(const Integer lhs, const decimal128_t rhs) noexcept BOOST_DECIMAL_REQUIRES_RETURN(detail::is_integral_v, Integer, decimal128_t) { + const bool sign {(lhs < 0) != rhs.isneg()}; + #ifndef BOOST_DECIMAL_FAST_MATH // Check pre-conditions constexpr decimal128_t zero {0, 0}; constexpr decimal128_t inf {from_bits(detail::d128_inf_mask)}; - const bool sign {(lhs < 0) != rhs.isneg()}; - const auto rhs_fp {fpclassify(rhs)}; switch (rhs_fp) @@ -2078,11 +2071,8 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator/(const Integer lhs, const decimal128_ detail::expand_significand(rhs_components.sig, rhs_components.exp); const detail::decimal128_t_components lhs_components {detail::make_positive_unsigned(lhs), 0, lhs < 0}; - detail::decimal128_t_components q_components {}; - - detail::d128_generic_div_impl(lhs_components, rhs_components, q_components); - return decimal128_t(q_components.sig, q_components.exp, q_components.sign); + return detail::d128_generic_div_impl(lhs_components, rhs_components, sign); } BOOST_DECIMAL_CUDA_CONSTEXPR auto operator%(const decimal128_t& lhs, const decimal128_t& rhs) noexcept -> decimal128_t diff --git a/include/boost/decimal/decimal_fast128_t.hpp b/include/boost/decimal/decimal_fast128_t.hpp index 052282b18..109ec686a 100644 --- a/include/boost/decimal/decimal_fast128_t.hpp +++ b/include/boost/decimal/decimal_fast128_t.hpp @@ -1399,20 +1399,7 @@ constexpr auto d128f_div_impl(const decimal_fast128_t& lhs, const decimal_fast12 static_cast(r); #endif - #ifdef BOOST_DECIMAL_DEBUG - std::cerr << "sig lhs: " << sig_lhs - << "\nexp lhs: " << exp_lhs - << "\nsig rhs: " << sig_rhs - << "\nexp rhs: " << exp_rhs << std::endl; - #endif - - constexpr auto ten_pow_precision {detail::pow10(int128::uint128_t(detail::precision_v))}; - const auto big_sig_lhs {detail::umul256(lhs.significand_, ten_pow_precision)}; - - const auto res_sig {big_sig_lhs / rhs.significand_}; - const auto res_exp {lhs.biased_exponent() - rhs.biased_exponent() - detail::precision_v}; - - q = decimal_fast128_t(static_cast(res_sig), res_exp, sign); + q = detail::d128_generic_div_impl(lhs.to_components(), rhs.to_components(), sign); } constexpr auto operator/(const decimal_fast128_t& lhs, const decimal_fast128_t& rhs) noexcept -> decimal_fast128_t @@ -1428,13 +1415,13 @@ template constexpr auto operator/(const decimal_fast128_t& lhs, const Integer rhs) noexcept BOOST_DECIMAL_REQUIRES_RETURN(detail::is_integral_v, Integer, decimal_fast128_t) { + const bool sign {lhs.isneg() != (rhs < 0)}; + #ifndef BOOST_DECIMAL_FAST_MATH // Check pre-conditions constexpr decimal_fast128_t zero {0, 0}; constexpr decimal_fast128_t inf {direct_init_d128(detail::d128_fast_inf, 0, false)}; - const bool sign {lhs.isneg() != (rhs < 0)}; - const auto lhs_fp {fpclassify(lhs)}; switch (lhs_fp) @@ -1459,24 +1446,21 @@ constexpr auto operator/(const decimal_fast128_t& lhs, const Integer rhs) noexce const auto rhs_sig {detail::make_positive_unsigned(rhs)}; const detail::decimal_fast128_t_components rhs_components {rhs_sig, 0, rhs < 0}; - detail::decimal_fast128_t_components q_components {}; - - detail::d128_generic_div_impl(lhs_components, rhs_components, q_components); - return {q_components.sig, q_components.exp, q_components.sign}; + return detail::d128_generic_div_impl(lhs_components, rhs_components, sign); } template constexpr auto operator/(const Integer lhs, const decimal_fast128_t& rhs) noexcept BOOST_DECIMAL_REQUIRES_RETURN(detail::is_integral_v, Integer, decimal_fast128_t) { + const bool sign {(lhs < 0) != rhs.isneg()}; + #ifndef BOOST_DECIMAL_FAST_MATH // Check pre-conditions constexpr decimal_fast128_t zero {0, 0}; constexpr decimal_fast128_t inf {direct_init_d128(detail::d128_fast_inf, 0, false)}; - const bool sign {(lhs < 0) != rhs.isneg()}; - const auto rhs_fp {fpclassify(rhs)}; switch (rhs_fp) @@ -1494,11 +1478,8 @@ constexpr auto operator/(const Integer lhs, const decimal_fast128_t& rhs) noexce const detail::decimal_fast128_t_components lhs_components {detail::make_positive_unsigned(lhs), 0, lhs < 0}; const detail::decimal_fast128_t_components rhs_components {rhs.significand_, rhs.biased_exponent(), rhs.isneg()}; - detail::decimal_fast128_t_components q_components {}; - - detail::d128_generic_div_impl(lhs_components, rhs_components, q_components); - return {q_components.sig, q_components.exp, q_components.sign}; + return detail::d128_generic_div_impl(lhs_components, rhs_components, sign); } constexpr auto operator%(const decimal_fast128_t& lhs, const decimal_fast128_t& rhs) noexcept -> decimal_fast128_t diff --git a/include/boost/decimal/decimal_fast64_t.hpp b/include/boost/decimal/decimal_fast64_t.hpp index 6f4046098..07deba724 100644 --- a/include/boost/decimal/decimal_fast64_t.hpp +++ b/include/boost/decimal/decimal_fast64_t.hpp @@ -1521,25 +1521,7 @@ constexpr auto d64_fast_div_impl(const decimal_fast64_t& lhs, const decimal_fast static_cast(r); #endif - #ifdef BOOST_DECIMAL_DEBUG - std::cerr << "sig lhs: " << sig_lhs - << "\nexp lhs: " << exp_lhs - << "\nsig rhs: " << sig_rhs - << "\nexp rhs: " << exp_rhs << std::endl; - #endif - - using unsigned_int128_type = boost::int128::uint128_t; - - // If rhs is greater than we need to offset the significands to get the correct values - // e.g. 4/8 is 0 but 40/8 yields 5 in integer maths - constexpr auto offset {std::numeric_limits::digits10 - detail::precision_v}; - constexpr auto tens_needed {detail::pow10(static_cast(offset))}; - const auto big_sig_lhs {static_cast(lhs.significand_) * tens_needed}; - - const auto res_sig {big_sig_lhs / rhs.significand_}; - const auto res_exp {(lhs.biased_exponent() - offset) - rhs.biased_exponent()}; - - q = decimal_fast64_t{res_sig, res_exp, sign}; + q = detail::d64_generic_div_impl(lhs.to_components(), rhs.to_components(), sign); } constexpr auto operator/(const decimal_fast64_t& lhs, const decimal_fast64_t& rhs) noexcept -> decimal_fast64_t From 5bcaa5d77a3eb5f3407dcf66e7a599632d0f18b2 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 17 Jun 2026 13:13:47 -0400 Subject: [PATCH 6/8] Mark function as constexpr --- include/boost/decimal/detail/u256.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/decimal/detail/u256.hpp b/include/boost/decimal/detail/u256.hpp index 58645df88..5959ae03e 100644 --- a/include/boost/decimal/detail/u256.hpp +++ b/include/boost/decimal/detail/u256.hpp @@ -1135,7 +1135,7 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u // path) or when the dividend's top 128 bits are not strictly less than the // normalized divisor (the 3/2 algorithm's precondition for a single-limb // quotient). -BOOST_DECIMAL_FORCE_INLINE bool mg32_u256_by_u128( +constexpr BOOST_DECIMAL_FORCE_INLINE bool mg32_u256_by_u128( u256 u, int128::uint128_t d, int128::uint128_t& q, int128::uint128_t& r) noexcept { From b9e731ee243c6c8064a374fd648648681b7879b5 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 17 Jun 2026 15:48:52 -0400 Subject: [PATCH 7/8] Mark function as constexpr --- include/boost/decimal/detail/int128/detail/common_div.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/decimal/detail/int128/detail/common_div.hpp b/include/boost/decimal/detail/int128/detail/common_div.hpp index d5d5c828b..361dd8dc4 100644 --- a/include/boost/decimal/detail/int128/detail/common_div.hpp +++ b/include/boost/decimal/detail/int128/detail/common_div.hpp @@ -603,7 +603,7 @@ BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_div_3by2( // Precondition: d.high has its MSB set (d >= 2^127). // Reference: Moller & Granlund 2010, Algorithm 2. template -BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_reciprocal_2by1(U128 d) noexcept +constexpr BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_reciprocal_2by1(U128 d) noexcept { // v_1 = floor((2^128 - 1) / d.high) - 2^64, computed via one 128/64 divide. // The trick: divq(UINT64_MAX - d.high, UINT64_MAX, d.high) sidesteps the From ac5c2acd2d5e165b55cc66fea3a9cae6b175fc33 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 18 Jun 2026 08:34:39 -0400 Subject: [PATCH 8/8] More x64 fixes --- .../detail/int128/detail/common_div.hpp | 2 +- include/boost/decimal/detail/u256.hpp | 29 +++++++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/include/boost/decimal/detail/int128/detail/common_div.hpp b/include/boost/decimal/detail/int128/detail/common_div.hpp index 361dd8dc4..d5d5c828b 100644 --- a/include/boost/decimal/detail/int128/detail/common_div.hpp +++ b/include/boost/decimal/detail/int128/detail/common_div.hpp @@ -603,7 +603,7 @@ BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_div_3by2( // Precondition: d.high has its MSB set (d >= 2^127). // Reference: Moller & Granlund 2010, Algorithm 2. template -constexpr BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_reciprocal_2by1(U128 d) noexcept +BOOST_DECIMAL_DETAIL_INT128_FORCE_INLINE std::uint64_t mg32_reciprocal_2by1(U128 d) noexcept { // v_1 = floor((2^128 - 1) / d.high) - 2^64, computed via one 128/64 divide. // The trick: divq(UINT64_MAX - d.high, UINT64_MAX, d.high) sidesteps the diff --git a/include/boost/decimal/detail/u256.hpp b/include/boost/decimal/detail/u256.hpp index 5959ae03e..cffe1aa5d 100644 --- a/include/boost/decimal/detail/u256.hpp +++ b/include/boost/decimal/detail/u256.hpp @@ -1135,7 +1135,7 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u // path) or when the dividend's top 128 bits are not strictly less than the // normalized divisor (the 3/2 algorithm's precondition for a single-limb // quotient). -constexpr BOOST_DECIMAL_FORCE_INLINE bool mg32_u256_by_u128( +BOOST_DECIMAL_FORCE_INLINE bool mg32_u256_by_u128( u256 u, int128::uint128_t d, int128::uint128_t& q, int128::uint128_t& r) noexcept { @@ -1188,6 +1188,22 @@ constexpr BOOST_DECIMAL_FORCE_INLINE bool mg32_u256_by_u128( #endif // BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128 +// True when the divisor fits in 128 bits, i.e. the MG u256-by-u128 fast path +// is applicable. A u256 divisor only qualifies when its upper 128 bits are +// clear; static_cast would otherwise truncate it silently and the +// fast path would divide by the wrong value. Narrower integer types always +// qualify. +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR bool divisor_fits_u128(const u256& rhs) noexcept +{ + return rhs.bytes[2] == UINT64_C(0) && rhs.bytes[3] == UINT64_C(0); +} + +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR bool divisor_fits_u128(const UnsignedInteger&) noexcept +{ + return true; +} + template BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u256& lhs, const UnsignedInteger& rhs) noexcept { @@ -1200,11 +1216,15 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR u256 default_div(const u return u256{}; } - #ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128 + #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128) && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION) // MG 3/2 fast path. Replaces ~3 hardware divides in the 32-bit Knuth-D // outer loop with one reciprocal-compute plus two cheap 3/2 inner steps. // Bails out (returns false) for inputs that don't fit the algorithm's // single-limb-quotient shape; those fall through to Knuth-D below. + // Runtime-only: the MG helpers call hardware intrinsics that are not + // usable in a constant expression, so this path is gated behind + // IS_CONSTANT_EVALUATED (and compiled out when detection is unavailable). + if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(lhs) && divisor_fits_u128(rhs)) { const int128::uint128_t rhs_u128 {static_cast(rhs)}; int128::uint128_t mg_q{}; @@ -1242,10 +1262,13 @@ struct u256_divmod_result template BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto div_mod(const u256& lhs, const UnsignedInteger& rhs) noexcept -> u256_divmod_result { - #ifdef BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128 + #if defined(BOOST_DECIMAL_DETAIL_INT128_HAS_FAST_DIV128) && !defined(BOOST_DECIMAL_DETAIL_INT128_NO_CONSTEVAL_DETECTION) // MG 3/2 fast path for u256/uint128. See default_div for rationale. // Single-limb divisors fall through to the 32-bit Knuth-D path below, // which has a dedicated n==1 short-circuit. + // Runtime-only: gated behind IS_CONSTANT_EVALUATED because the MG helpers + // call hardware intrinsics that are not usable in a constant expression. + if (!BOOST_DECIMAL_DETAIL_INT128_IS_CONSTANT_EVALUATED(lhs) && divisor_fits_u128(rhs)) { const int128::uint128_t rhs_u128 {static_cast(rhs)}; if (rhs_u128.high != 0U)