QuickCppLib 0.10
Eliminate all the tedious hassle when making state-of-the-art C++ 14 - 23 libraries!
Loading...
Searching...
No Matches
quickcpplib::_xxx::algorithm::bit_interleave Namespace Reference

Namespaces

namespace  detail
 

Classes

struct  bit_deinterleave_result
 

Functions

template<class T , class R = typename detail::next_larger<T>::type, typename std::enable_if<(std::is_unsigned< T >::value), bool >::type = true>
bit_interleave (T a, T b) noexcept
 Interleaves the bits of \emph a and \emph b.
 
template<class T , class R = typename detail::next_smaller<T>::type, typename std::enable_if<(std::is_unsigned< T >::value), bool >::type = true>
bit_deinterleave_result< R > bit_deinterleave (T x) noexcept
 Deinterleaves the bits in X into evens and odds.
 

Function Documentation

◆ bit_interleave()

template<class T , class R = typename detail::next_larger<T>::type, typename std::enable_if<(std::is_unsigned< T >::value), bool >::type = true>
R quickcpplib::_xxx::algorithm::bit_interleave::bit_interleave ( a,
b 
)
inlinenoexcept

Interleaves the bits of \emph a and \emph b.

On my Intel i7-8565u laptop able to boost to 4.6Ghz:

Straight C edition on MSVC: 4.8052e+08 32-bit to 64-bit interleaves/sec which is 2.08108ns/interleave (9.57 cycles) Straight C edition on GCC: 3.39213e+08 32-bit to 64-bit interleaves/sec which is 2.948ns/interleave (13.56 cycles)

99 {
100#if 0 // defined(__AVX__) || defined(__SSE4_1__) || defined(__SSSE3__)
101 /* https://lemire.me/blog/2018/01/09/how-fast-can-you-bit-interleave-32-bit-integers-simd-edition/
102 says that AVX is considerably faster than the SSSE3 bit interleave if you need to interleave two 128
103 bit values into a 256 bit value, but we don't support that here yet.
104
105 PDEP has a 19 cycle latency on most AMD CPUs, the C fallback is considerably quicker.
106 */
107#else
108 // Standard C fallback which processes both halves in parallel to leverage superscalar execution
109 R ret1 = R(a), ret2 = R(b);
110 constexpr R mask16 = R(0x0000ffff0000ffff), mask8 = R(0x00ff00ff00ff00ff) /* 0000 0000 1111 1111 */,
111 mask4 = R(0x0f0f0f0f0f0f0f0f) /* 0000 1111 */, mask2 = R(0x3333333333333333) /* 0011 0011 */,
112 mask1 = R(0x5555555555555555) /* 0101 0101 */;
113 if(sizeof(T) >= 4)
114 {
115 ret1 = (ret1 ^ (ret1 << 16)) & mask16;
116 ret2 = (ret2 ^ (ret2 << 16)) & mask16;
117 }
118 if(sizeof(T) >= 2)
119 {
120 ret1 = (ret1 ^ (ret1 << 8)) & mask8;
121 ret2 = (ret2 ^ (ret2 << 8)) & mask8;
122 }
123 ret1 = (ret1 ^ (ret1 << 4)) & mask4;
124 ret2 = (ret2 ^ (ret2 << 4)) & mask4;
125 ret1 = (ret1 ^ (ret1 << 2)) & mask2;
126 ret2 = (ret2 ^ (ret2 << 2)) & mask2;
127 ret1 = (ret1 ^ (ret1 << 1)) & mask1;
128 ret2 = (ret2 ^ (ret2 << 1)) & mask1;
129 return ret1 | (ret2 << 1);
130#endif
131 }

◆ bit_deinterleave()

template<class T , class R = typename detail::next_smaller<T>::type, typename std::enable_if<(std::is_unsigned< T >::value), bool >::type = true>
bit_deinterleave_result< R > quickcpplib::_xxx::algorithm::bit_interleave::bit_deinterleave ( x)
inlinenoexcept

Deinterleaves the bits in X into evens and odds.

142 {
143 constexpr T /* mask32 = T(0x00000000ffffffff), */ mask16 = T(0x0000ffff0000ffff),
144 mask8 = T(0x00ff00ff00ff00ff) /* 0000 0000 1111 1111 */,
145 mask4 = T(0x0f0f0f0f0f0f0f0f) /* 0000 1111 */, mask2 = T(0x3333333333333333) /* 0011 0011 */,
146 mask1 = T(0x5555555555555555) /* 0101 0101 */;
147 T ret1 = x & mask1, ret2 = (x >> 1) & mask1;
148 ret1 = (ret1 ^ (ret1 >> 1)) & mask2;
149 ret2 = (ret2 ^ (ret2 >> 1)) & mask2;
150 ret1 = (ret1 ^ (ret1 >> 2)) & mask4;
151 ret2 = (ret2 ^ (ret2 >> 2)) & mask4;
152 if(sizeof(T) >= 2)
153 {
154 ret1 = (ret1 ^ (ret1 >> 4)) & mask8;
155 ret2 = (ret2 ^ (ret2 >> 4)) & mask8;
156 }
157 if(sizeof(T) >= 4)
158 {
159 ret1 = (ret1 ^ (ret1 >> 8)) & mask16;
160 ret2 = (ret2 ^ (ret2 >> 8)) & mask16;
161 }
162 if(sizeof(T) >= 8)
163 {
164 ret1 = (ret1 ^ (ret1 >> 16)) /*& mask32*/;
165 ret2 = (ret2 ^ (ret2 >> 16)) /*& mask32*/;
166 }
167 return bit_deinterleave_result<R>{R(ret1), R(ret2)};
168 }