Skip to content
Merged
18 changes: 0 additions & 18 deletions benchmarks/src/replace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,6 @@ void rc(benchmark::State& state) {
}
}

// Benchmark for std::replace_copy_if on elements of type T.
// The input buffer is built from lorem_ipsum (each element converted to T) and the output buffer has the same
// length. Every element that compares <= T{'Z'} is written to the output as T{'X'}; all others are copied as-is.
template <class T>
void rc_if(benchmark::State& state) {
    using buffer_t = std::vector<T, not_highly_aligned_allocator<T>>;

    buffer_t input(lorem_ipsum.begin(), lorem_ipsum.end());
    buffer_t output(lorem_ipsum.size());

    // Non-capturing predicate: selects every element that is not greater than 'Z'.
    const auto not_above_upper_z = [](auto x) { return x <= T{'Z'}; };

    for (auto _ : state) {
        // DoNotOptimize on the input and on the output keeps the optimizer from eliding the measured call.
        benchmark::DoNotOptimize(input);
        (void) std::replace_copy_if(input.begin(), input.end(), output.begin(), not_above_upper_z, T{'X'});
        benchmark::DoNotOptimize(output);
    }
}

// replace() is vectorized for 4 and 8 bytes only.
BENCHMARK(r<std::uint32_t>);
BENCHMARK(r<std::uint64_t>);
Expand All @@ -56,9 +43,4 @@ BENCHMARK(rc<std::uint16_t>);
BENCHMARK(rc<std::uint32_t>);
BENCHMARK(rc<std::uint64_t>);

// replace_copy_if() is benchmarked for 1-, 2-, 4-, and 8-byte unsigned elements.
BENCHMARK(rc_if<std::uint8_t>);
BENCHMARK(rc_if<std::uint16_t>);
BENCHMARK(rc_if<std::uint32_t>);
BENCHMARK(rc_if<std::uint64_t>);

BENCHMARK_MAIN();
144 changes: 107 additions & 37 deletions stl/inc/algorithm
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,17 @@ __declspec(noalias) void __stdcall __std_replace_8(
void* _First, void* _Last, uint64_t _Old_val, uint64_t _New_val) noexcept;
#endif // ^^^ _VECTORIZED_REPLACE ^^^

#if _VECTORIZED_REPLACE_COPY
// replace_copy kernels, one per element width; the numeric suffix is the element size in bytes.
// Each one copies the elements of [_First, _Last) to _Dest, storing _New_val in place of every element that is
// equal to _Old_val. The ranges are passed as untyped pointers; the values are passed as the unsigned integer type
// of the matching width.
__declspec(noalias) void __stdcall __std_replace_copy_1(
const void* _First, const void* _Last, void* _Dest, uint8_t _Old_val, uint8_t _New_val) noexcept;
__declspec(noalias) void __stdcall __std_replace_copy_2(
const void* _First, const void* _Last, void* _Dest, uint16_t _Old_val, uint16_t _New_val) noexcept;
__declspec(noalias) void __stdcall __std_replace_copy_4(
const void* _First, const void* _Last, void* _Dest, uint32_t _Old_val, uint32_t _New_val) noexcept;
__declspec(noalias) void __stdcall __std_replace_copy_8(
const void* _First, const void* _Last, void* _Dest, uint64_t _Old_val, uint64_t _New_val) noexcept;
#endif // ^^^ _VECTORIZED_REPLACE_COPY ^^^

#if _VECTORIZED_SEARCH_N
const void* __stdcall __std_search_n_1(const void* _First, const void* _Last, size_t _Count, uint8_t _Value) noexcept;
const void* __stdcall __std_search_n_2(const void* _First, const void* _Last, size_t _Count, uint16_t _Value) noexcept;
Expand Down Expand Up @@ -358,6 +369,28 @@ __declspec(noalias) void _Replace_vectorized(
}
#endif // ^^^ _VECTORIZED_REPLACE ^^^

#if _VECTORIZED_REPLACE_COPY
// Typed front end for the replace_copy kernels: picks the __std_replace_copy_N function whose N equals sizeof(_Ty)
// and passes _Old_val / _New_val through _Find_arg_cast to the unsigned integer type of that width.
// Any element size other than 1, 2, 4, or 8 bytes is rejected at compile time.
template <class _Ty, class _TVal1, class _TVal2>
__declspec(noalias) void _Replace_copy_vectorized(const _Ty* const _First, const _Ty* const _Last, _Ty* const _Dest,
    const _TVal1 _Old_val, const _TVal2 _New_val) noexcept {
    constexpr size_t _Elem_size = sizeof(_Ty);

    if constexpr (_Elem_size == 8) {
        const auto _Old_bits = _STD _Find_arg_cast<uint64_t>(_Old_val);
        const auto _New_bits = _STD _Find_arg_cast<uint64_t>(_New_val);
        ::__std_replace_copy_8(_First, _Last, _Dest, _Old_bits, _New_bits);
    } else if constexpr (_Elem_size == 4) {
        const auto _Old_bits = _STD _Find_arg_cast<uint32_t>(_Old_val);
        const auto _New_bits = _STD _Find_arg_cast<uint32_t>(_New_val);
        ::__std_replace_copy_4(_First, _Last, _Dest, _Old_bits, _New_bits);
    } else if constexpr (_Elem_size == 2) {
        const auto _Old_bits = _STD _Find_arg_cast<uint16_t>(_Old_val);
        const auto _New_bits = _STD _Find_arg_cast<uint16_t>(_New_val);
        ::__std_replace_copy_2(_First, _Last, _Dest, _Old_bits, _New_bits);
    } else if constexpr (_Elem_size == 1) {
        const auto _Old_bits = _STD _Find_arg_cast<uint8_t>(_Old_val);
        const auto _New_bits = _STD _Find_arg_cast<uint8_t>(_New_val);
        ::__std_replace_copy_1(_First, _Last, _Dest, _Old_bits, _New_bits);
    } else {
        static_assert(false, "unexpected size");
    }
}
#endif // ^^^ _VECTORIZED_REPLACE_COPY ^^^

#if _VECTORIZED_SEARCH_N
template <class _Ty, class _TVal>
_Ty* _Search_n_vectorized(_Ty* const _First, _Ty* const _Last, const size_t _Count, const _TVal _Val) noexcept {
Expand Down Expand Up @@ -476,6 +509,17 @@ constexpr bool _Output_iterator_for_vector_alg_is_safe() {
}
#endif // ^^^ _VECTORIZED_REMOVE_COPY || _VECTORIZED_UNIQUE_COPY ^^^

#if _VECTORIZED_REPLACE_COPY
template <class _Out, class _In>
constexpr bool _Output_iterator_for_known_size_vector_alg_is_safe() {
if constexpr (_Iterator_is_contiguous<_Out>) {
return is_same_v<_Iter_value_t<_Out>, remove_const_t<_Iter_value_t<_In>>>;
} else {
return false;
}
}
#endif // ^^^ _VECTORIZED_REPLACE_COPY ^^^

#if _VECTORIZED_INCLUDES
// Can we activate the vector algorithms for includes?
template <class _Iter1, class _Iter2, class _Elem = _Iter_value_t<_Iter1>>
Expand Down Expand Up @@ -4562,15 +4606,6 @@ namespace ranges {
} // namespace ranges
#endif // _HAS_CXX20

// TRANSITION, DevCom-10606350: help the compiler auto-vectorize for simple types
// True when all of the following hold:
// * _UOutIt is a raw pointer (_OutTy defaults to the type it points to),
// * _InTy and _NewTy are the same type, and
// * _InTy and _OutTy are both byte (only when __cpp_lib_byte is defined), both integral, or both pointers.
template <class _UOutIt, class _InTy, class _NewTy, class _OutTy = remove_pointer_t<_UOutIt>>
constexpr bool _Can_vectorize_replace_copy = conjunction_v<is_pointer<_UOutIt>, is_same<_InTy, _NewTy>,
disjunction<
#ifdef __cpp_lib_byte
conjunction<is_same<_InTy, byte>, is_same<_OutTy, byte>>,
#endif // defined(__cpp_lib_byte)
conjunction<is_integral<_InTy>, is_integral<_OutTy>>, conjunction<is_pointer<_InTy>, is_pointer<_OutTy>>>>;

_EXPORT_STD template <class _InIt, class _OutIt, class _Ty>
_CONSTEXPR20 _OutIt replace_copy(_InIt _First, _InIt _Last, _OutIt _Dest, const _Ty& _Oldval, const _Ty& _Newval) {
// copy replacing each matching _Oldval with _Newval
Expand All @@ -4579,15 +4614,36 @@ _CONSTEXPR20 _OutIt replace_copy(_InIt _First, _InIt _Last, _OutIt _Dest, const
auto _UFirst = _STD _Get_unwrapped(_First);
const auto _ULast = _STD _Get_unwrapped(_Last);
auto _UDest = _STD _Get_unwrapped_n(_Dest, _STD _Idl_distance<_InIt>(_UFirst, _ULast));
for (; _UFirst != _ULast; ++_UFirst, (void) ++_UDest) {
if constexpr (_Can_vectorize_replace_copy<decltype(_UDest), _Iter_value_t<_InIt>, _Ty>) {
*_UDest = *_UFirst == _Oldval ? _Newval : *_UFirst;
} else {
if (*_UFirst == _Oldval) {
*_UDest = _Newval;

#if _VECTORIZED_REPLACE_COPY
if constexpr (_Vector_alg_in_find_is_safe<decltype(_UFirst), _Ty>
&& _Output_iterator_for_known_size_vector_alg_is_safe<decltype(_UDest), decltype(_UFirst)>()) {
if (!_STD _Is_constant_evaluated()) {
const auto _Count = static_cast<_Iter_diff_t<decltype(_UDest)>>(_ULast - _UFirst);
_STD _Contiguous_iter_verify(_UDest, _Count);

const auto _First_ptr = _STD _To_address(_UFirst);
const auto _Last_ptr = _STD _To_address(_ULast);
Comment thread
StephanTLavavej marked this conversation as resolved.
const auto _Dest_ptr = _STD _To_address(_UDest);

if (_STD _Could_compare_equal_to_value_type<decltype(_UFirst)>(_Oldval)) {
_STD _Replace_copy_vectorized(_First_ptr, _Last_ptr, _Dest_ptr, _Oldval, _Newval);
} else {
*_UDest = *_UFirst;
_CSTD memcpy(_Dest_ptr, _First_ptr, static_cast<size_t>(_Count) * sizeof(*_Dest_ptr));
}

_UDest += _Count;
_STD _Seek_wrapped(_Dest, _UDest);
return _Dest;
}
}
#endif // ^^^ _VECTORIZED_REPLACE_COPY ^^^

for (; _UFirst != _ULast; ++_UFirst, (void) ++_UDest) {
if (*_UFirst == _Oldval) {
*_UDest = _Newval;
} else {
*_UDest = *_UFirst;
}
}

Expand Down Expand Up @@ -4660,15 +4716,37 @@ namespace ranges {

_STD _Verify_ranges_do_not_overlap(_First, _Last, _Output);

for (; _First != _Last; ++_First, (void) ++_Output) {
if constexpr (_Can_vectorize_replace_copy<_Out, iter_value_t<_It>, _Ty2>) {
*_Output = _STD invoke(_Proj, *_First) == _Oldval ? _Newval : *_First;
} else {
if (_STD invoke(_Proj, *_First) == _Oldval) {
*_Output = _Newval;
#if _VECTORIZED_REPLACE_COPY
if constexpr (is_same_v<_Pj, identity> && sized_sentinel_for<_Se, _It>
&& _Vector_alg_in_find_is_safe<_It, _Ty1> && _Vector_alg_in_find_is_safe<_It, _Ty2>
&& _Output_iterator_for_known_size_vector_alg_is_safe<_Out, _It>()) {
if (!_STD is_constant_evaluated()) {
const auto _Count = _Last - _First;
_STD _Contiguous_iter_verify(_First, _Count);
_STD _Contiguous_iter_verify(_Output, static_cast<iter_difference_t<_Out>>(_Count));

const auto _First_ptr = _STD to_address(_First);
const auto _Last_ptr = _First_ptr + static_cast<size_t>(_Count);
const auto _Out_ptr = _STD to_address(_Output);

if (_STD _Could_compare_equal_to_value_type<_It>(_Oldval)) {
_STD _Replace_copy_vectorized(_First_ptr, _Last_ptr, _Out_ptr, _Oldval, _Newval);
} else {
*_Output = *_First;
_CSTD memcpy(_Out_ptr, _First_ptr, static_cast<size_t>(_Count) * sizeof(*_Out_ptr));
}

_First += _Count;
_Output += static_cast<iter_difference_t<_Out>>(_Count);
return {_STD move(_First), _STD move(_Output)};
}
}
#endif // ^^^ _VECTORIZED_REPLACE_COPY ^^^

for (; _First != _Last; ++_First, (void) ++_Output) {
if (_STD invoke(_Proj, *_First) == _Oldval) {
*_Output = _Newval;
} else {
*_Output = *_First;
}
}

Expand All @@ -4689,14 +4767,10 @@ _CONSTEXPR20 _OutIt replace_copy_if(_InIt _First, _InIt _Last, _OutIt _Dest, _Pr
const auto _ULast = _STD _Get_unwrapped(_Last);
auto _UDest = _STD _Get_unwrapped_n(_Dest, _STD _Idl_distance<_InIt>(_UFirst, _ULast));
for (; _UFirst != _ULast; ++_UFirst, (void) ++_UDest) {
if constexpr (_Can_vectorize_replace_copy<decltype(_UDest), _Iter_value_t<_InIt>, _Ty>) {
*_UDest = _Pred(*_UFirst) ? _Val : *_UFirst;
if (_Pred(*_UFirst)) {
*_UDest = _Val;
} else {
if (_Pred(*_UFirst)) {
*_UDest = _Val;
} else {
*_UDest = *_UFirst;
}
*_UDest = *_UFirst;
}
}

Expand Down Expand Up @@ -4771,14 +4845,10 @@ namespace ranges {
_STD _Verify_ranges_do_not_overlap(_First, _Last, _Output);

for (; _First != _Last; ++_First, (void) ++_Output) {
if constexpr (_Can_vectorize_replace_copy<_Out, iter_value_t<_It>, _Ty>) {
*_Output = _STD invoke(_Pred, _STD invoke(_Proj, *_First)) ? _Newval : *_First;
if (_STD invoke(_Pred, _STD invoke(_Proj, *_First))) {
*_Output = _Newval;
} else {
if (_STD invoke(_Pred, _STD invoke(_Proj, *_First))) {
*_Output = _Newval;
} else {
*_Output = *_First;
}
*_Output = *_First;
}
}

Expand Down
1 change: 1 addition & 0 deletions stl/inc/xutility
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ _STL_DISABLE_CLANG_WARNINGS
#define _VECTORIZED_REMOVE _VECTORIZED_FOR_X64_X86
#define _VECTORIZED_REMOVE_COPY _VECTORIZED_FOR_X64_X86
#define _VECTORIZED_REPLACE _VECTORIZED_FOR_X64_X86
#define _VECTORIZED_REPLACE_COPY _VECTORIZED_FOR_X64_X86
#define _VECTORIZED_REVERSE _VECTORIZED_FOR_X64_X86_ARM64
#define _VECTORIZED_REVERSE_COPY _VECTORIZED_FOR_X64_X86_ARM64
#define _VECTORIZED_ROTATE _VECTORIZED_FOR_X64_X86_ARM64
Expand Down
94 changes: 94 additions & 0 deletions stl/src/vector_algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6486,6 +6486,80 @@ __declspec(noalias) size_t __stdcall __std_mismatch_8(
return _Mismatching::_Mismatch_impl<uint64_t>(_First1, _First2, _Count);
}

} // extern "C"

namespace {
namespace _Replacing {
// Copies the _Ty elements of [_First, _Last) to _Dest, storing _New_val in place of every element equal to _Old_val.
// _Traits supplies the per-width vector helpers used below (_Set_avx/_Cmp_avx and _Set_sse/_Cmp_sse).
// Strategy: one vector path (AVX2 if usable, otherwise SSE4.2) handles as many whole vector blocks as possible,
// then a scalar loop finishes whatever is left. On ARM64EC the vector paths are compiled out entirely.
template <class _Traits, class _Ty>
__declspec(noalias) void __stdcall _Replace_copy_impl(
const void* _First, const void* const _Last, void* _Dest, const _Ty _Old_val, const _Ty _New_val) noexcept {
#ifndef _M_ARM64EC
// NOTE(review): presumably the distance from _First to _Last in bytes - confirm against _Byte_length.
const size_t _Size_bytes = _Byte_length(_First, _Last);

// AVX2 path: taken only when there is at least one full 32-byte block and _Use_avx2() reports true.
if (const size_t _Avx_size = _Size_bytes & ~size_t{0x1F}; _Avx_size != 0 && _Use_avx2()) {
// NOTE(review): _Set_avx presumably broadcasts the value to every element lane - confirm in _Find_traits_N.
const __m256i _Comparand = _Traits::_Set_avx(_Old_val);
const __m256i _Replacement = _Traits::_Set_avx(_New_val);
const void* _Stop_at = _First;
_Advance_bytes(_Stop_at, _Avx_size);

// Whole 32-byte blocks: unaligned load, compare against the old value, take the replacement wherever the
// comparison mask is set and the original data elsewhere, then unaligned store.
do {
const __m256i _Data = _mm256_loadu_si256(static_cast<const __m256i*>(_First));
const __m256i _Mask = _Traits::_Cmp_avx(_Data, _Comparand);
const __m256i _Val = _mm256_blendv_epi8(_Data, _Replacement, _Mask);

_mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Val);

_Advance_bytes(_First, 32);
_Advance_bytes(_Dest, 32);
} while (_First != _Stop_at);

// Tail: 0x1C keeps bits 2-4 of the size, i.e. the leftover bytes (fewer than 32) rounded down to a multiple of 4.
// Those bytes are processed with a masked load and masked store that work on 32-bit lanes.
if (const size_t _Avx_tail_size = _Size_bytes & 0x1C; _Avx_tail_size != 0) {
const __m256i _Tail_mask = _Avx2_tail_mask_32(_Avx_tail_size);
const __m256i _Data = _mm256_maskload_epi32(static_cast<const int*>(_First), _Tail_mask);
const __m256i _Mask = _Traits::_Cmp_avx(_Data, _Comparand);
const __m256i _Val = _mm256_blendv_epi8(_Data, _Replacement, _Mask);

_mm256_maskstore_epi32(static_cast<int*>(_Dest), _Tail_mask, _Val);

_Advance_bytes(_First, _Avx_tail_size);
_Advance_bytes(_Dest, _Avx_tail_size);
}

_mm256_zeroupper(); // TRANSITION, DevCom-10331414

// For elements of 4 or 8 bytes the masked tail has consumed everything, so the scalar loop is skipped.
// For 1- and 2-byte elements up to 3 bytes may remain; those fall through to the scalar loop below.
if constexpr (sizeof(_Ty) >= 4) {
return;
}
// SSE4.2 path: taken when the AVX2 path was not, there is at least one full 16-byte block, and
// _Use_sse42() reports true. It has no masked tail; the remainder is left to the scalar loop.
} else if (const size_t _Sse_size = _Size_bytes & ~size_t{0xF}; _Sse_size != 0 && _Use_sse42()) {
const __m128i _Comparand = _Traits::_Set_sse(_Old_val);
const __m128i _Replacement = _Traits::_Set_sse(_New_val);
const void* _Stop_at = _First;
_Advance_bytes(_Stop_at, _Sse_size);

do {
const __m128i _Data = _mm_loadu_si128(static_cast<const __m128i*>(_First));
const __m128i _Mask = _Traits::_Cmp_sse(_Data, _Comparand);
const __m128i _Val = _mm_blendv_epi8(_Data, _Replacement, _Mask);

_mm_storeu_si128(static_cast<__m128i*>(_Dest), _Val);

_Advance_bytes(_First, 16);
_Advance_bytes(_Dest, 16);
} while (_First != _Stop_at);
}
#endif // ^^^ !defined(_M_ARM64EC) ^^^
// Scalar loop: handles the elements not covered by a vector path (or the whole range when none was taken).
auto _Ptr_dest = static_cast<_Ty*>(_Dest);
for (auto _Ptr_src = static_cast<const _Ty*>(_First); _Ptr_src != _Last; ++_Ptr_src) {
const _Ty _Val = *_Ptr_src;
*_Ptr_dest = _Val == _Old_val ? _New_val : _Val;
++_Ptr_dest;
}
}
} // namespace _Replacing
} // unnamed namespace

extern "C" {

__declspec(noalias) void __stdcall __std_replace_4(
void* _First, void* const _Last, const uint32_t _Old_val, const uint32_t _New_val) noexcept {
#ifndef _M_ARM64EC
Expand Down Expand Up @@ -6567,6 +6641,26 @@ __declspec(noalias) void __stdcall __std_replace_8(
}
}

// replace_copy entry point for 1-byte elements: forwards to _Replace_copy_impl with the 1-byte find traits.
__declspec(noalias) void __stdcall __std_replace_copy_1(const void* const _First, const void* const _Last,
void* const _Dest, const uint8_t _Old_val, const uint8_t _New_val) noexcept {
_Replacing::_Replace_copy_impl<_Finding::_Find_traits_1>(_First, _Last, _Dest, _Old_val, _New_val);
}

// replace_copy entry point for 2-byte elements: forwards to _Replace_copy_impl with the 2-byte find traits.
__declspec(noalias) void __stdcall __std_replace_copy_2(const void* const _First, const void* const _Last,
void* const _Dest, const uint16_t _Old_val, const uint16_t _New_val) noexcept {
_Replacing::_Replace_copy_impl<_Finding::_Find_traits_2>(_First, _Last, _Dest, _Old_val, _New_val);
}

// replace_copy entry point for 4-byte elements: forwards to _Replace_copy_impl with the 4-byte find traits.
__declspec(noalias) void __stdcall __std_replace_copy_4(const void* const _First, const void* const _Last,
void* const _Dest, const uint32_t _Old_val, const uint32_t _New_val) noexcept {
_Replacing::_Replace_copy_impl<_Finding::_Find_traits_4>(_First, _Last, _Dest, _Old_val, _New_val);
}

// replace_copy entry point for 8-byte elements: forwards to _Replace_copy_impl with the 8-byte find traits.
__declspec(noalias) void __stdcall __std_replace_copy_8(const void* const _First, const void* const _Last,
void* const _Dest, const uint64_t _Old_val, const uint64_t _New_val) noexcept {
_Replacing::_Replace_copy_impl<_Finding::_Find_traits_8>(_First, _Last, _Dest, _Old_val, _New_val);
}

} // extern "C"

namespace {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,11 +221,17 @@ int main() {
assert(r_rot_it == temp_end - rotate_pos);
}
{
// Out of replace family, only replace for 32-bit and 64-bit elements is manually vectorized,
// replace_copy is auto vectorized (along with replace_copy_if)
const int replace_expected[] = {
200, 210, 220, 333, 240, 333, 333, 270, 280, 290, 300, 310, 320, 333, 340, 333, 333, 370, 380, 390};

auto repl_copy_it = replace_copy(arr_begin, arr_end, temp_begin, 250, 333);
assert(equal(temp_begin, temp_end, begin(replace_expected), end(replace_expected)));
assert(repl_copy_it == temp_end);

auto r_repl_copy_it = ranges::replace_copy(arr_begin, arr_end, temp_begin, 250, 333).out;
assert(ranges::equal(temp_begin, temp_end, begin(replace_expected), end(replace_expected)));
assert(r_repl_copy_it == temp_end);

copy(arr_begin, arr_end, temp_begin);
replace(temp_begin, temp_end, 250, 333);
assert(equal(temp_begin, temp_end, begin(replace_expected), end(replace_expected)));
Expand Down
Loading