/*
 * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy@amyspark.me>
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#ifndef KO_RGBA_INTERLEAVERS
#define KO_RGBA_INTERLEAVERS

#include <xsimd_extensions/xsimd.hpp>

#if XSIMD_VERSION_MAJOR >= 10
#error "The interleavers use per-lane zipping semantics, which are not compatible with xsimd 10"
#endif

using namespace xsimd;

// SFINAE helpers: constrain overloads to 4-byte element types (optionally
// integral) and to four-lane batches, respectively.
template<typename T, size_t S>
using enable_sized_t = typename std::enable_if<sizeof(T) == S, int>::type;

template<typename T, size_t S>
using enable_sized_integral_t =
    typename std::enable_if<std::is_integral<T>::value && sizeof(T) == S,
                            int>::type;

template<typename T, typename A, size_t S>
using enable_sized_vector_t = typename std::enable_if<batch<T, A>::size == S, int>::type;

#if XSIMD_WITH_AVX2
// Swaps the two middle 64-bit quarters: [q0 q1 q2 q3] -> [q0 q2 q1 q3]
// (the control byte 0xD8 selects indices 0, 2, 1, 3).
template<typename A>
inline batch<float, A> exchange_mid_halves(batch<float, A> const &a, kernel::requires_arch<avx2>) noexcept
{
    return _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(a.data), 0xD8));
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> exchange_mid_halves(batch<T, A> const &a, kernel::requires_arch<avx2>) noexcept
{
    return _mm256_permute4x64_epi64(a.data, 0xD8);
}
#endif

#if XSIMD_WITH_AVX
// Concatenates the low 128-bit halves of a and b: [a_lo | b_lo].
template<typename A>
inline batch<float, A> merge_low(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_insertf128_ps(a, _mm256_castps256_ps128(b), 1);
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> merge_low(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_insertf128_si256(a, _mm256_castsi256_si128(b), 1);
}

// Concatenates the high 128-bit halves of a and b: [a_hi | b_hi].
template<typename A>
inline batch<float, A> merge_high(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_permute2f128_ps(a, b, 0x31);
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> merge_high(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_permute2f128_si256(a, b, 0x31);
}

// Picks the even-indexed elements of each 128-bit lane:
// [a0 a2 b0 b2 | a4 a6 b4 b6].
template<typename A>
inline batch<float, A> duplicate_low_halves(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> duplicate_low_halves(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
}

// Picks the odd-indexed elements of each 128-bit lane:
// [a1 a3 b1 b3 | a5 a7 b5 b7].
template<typename A>
inline batch<float, A> duplicate_high_halves(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> duplicate_high_halves(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
}
#endif

template<size_t N>
struct KoRgbaInterleavers;

// Stride-2 interleaver: memory order is a0 b0 a1 b1 ...
template<>
struct KoRgbaInterleavers<16> {
    template<bool aligned, typename T, typename A, enable_sized_integral_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
    static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<generic>)
    {
        auto *dstPtr = static_cast<T *>(dst);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
        const auto t1 = zip_lo(a, b);
        const auto t2 = zip_hi(a, b);
        t1.store(dstPtr, U{});
        t2.store(dstPtr + batch<T, A>::size, U{});
    }

    // The AVX versions are handmade ports of the ones generated by
    // Clang 14.0.0 (https://godbolt.org/z/Ts8MWosW3), except for
    // interleave(avx), which comes from GCC 11.2.

#if XSIMD_WITH_AVX
    template<bool aligned, typename T, typename A, enable_sized_t<T, 4> = 0>
    static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>)
    {
        auto *dstPtr = static_cast<T *>(dst);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
        // zip_lo/zip_hi are per-128-bit-lane here; the merges reassemble
        // the two lanes into sequential order before storing.
        const auto t1 = zip_lo(a, b);
        const auto t2 = zip_hi(a, b);
        const auto src1 = merge_low(t1, t2, A{});
        const auto src2 = merge_high(t1, t2, A{});
        src1.store(dstPtr, U{});
        src2.store(dstPtr + batch<T, A>::size, U{});
    }
#endif

    template<typename T, typename A, bool aligned = false>
    static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b)
    {
        return interleave<aligned>(dst, a, b, A{});
    }

    template<bool aligned, typename T, typename A, enable_sized_integral_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &dst1, batch<T, A> &dst2, kernel::requires_arch<generic>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto a = batch<T, A>::load(srcPtr, U{});
        const auto b = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto t1 = zip_lo(a, b);
        const auto t2 = zip_hi(a, b);
        dst1 = zip_lo(t1, t2);
        dst2 = zip_hi(t1, t2);
    }

#if XSIMD_WITH_AVX2
    template<bool aligned, typename T, typename A, enable_sized_t<T, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, kernel::requires_arch<avx2>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
        const auto src1 = batch<T, A>::load(srcPtr, U{});
        const auto src2 = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto t1 = duplicate_low_halves(src1, src2, A{});
        a = exchange_mid_halves(t1, A{});
        const auto t2 = duplicate_high_halves(src1, src2, A{});
        b = exchange_mid_halves(t2, A{});
    }
#endif
#if XSIMD_WITH_AVX
    template<bool aligned, typename T, typename A, enable_sized_t<T, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, kernel::requires_arch<avx>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
        const auto src1 = batch<T, A>::load(srcPtr, U{});
        const auto src2 = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto t1 = merge_high(src1, src2, A{});
        const auto t2 = merge_low(src1, src2, A{});
        a.data = duplicate_low_halves(t2, t1, A{});
        b.data = duplicate_high_halves(t2, t1, A{});
    }
#endif

    template<typename T, typename A, bool aligned = false>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b)
    {
        return deinterleave<aligned>(src, a, b, A{});
    }
};

// Stride-4 interleaver: memory order is a0 b0 c0 d0 a1 b1 c1 d1 ...
template<>
struct KoRgbaInterleavers<32> {
    template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
    static inline void
    interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, batch<T, A> const &c, batch<T, A> const &d, kernel::requires_arch<generic>)
    {
        auto *dstPtr = static_cast<T *>(dst);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto t1 = zip_lo(a, c);
        const auto t2 = zip_hi(a, c);
        const auto t3 = zip_lo(b, d);
        const auto t4 = zip_hi(b, d);
        const auto src1 = zip_lo(t1, t3);
        const auto src2 = zip_hi(t1, t3);
        const auto src3 = zip_lo(t2, t4);
        const auto src4 = zip_hi(t2, t4);
        src1.store(dstPtr, U{});
        src2.store(dstPtr + batch<T, A>::size, U{});
        src3.store(dstPtr + batch<T, A>::size * 2, U{});
        src4.store(dstPtr + batch<T, A>::size * 3, U{});
    }

#if XSIMD_WITH_AVX
    template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0>
    static inline void
    interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, batch<T, A> const &c, batch<T, A> const &d, kernel::requires_arch<avx>)
    {
        auto *dstPtr = static_cast<T *>(dst);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto t1 = zip_lo(a, c);
        const auto t2 = zip_lo(b, d);
        const auto t3 = zip_hi(a, c);
        const auto t4 = zip_hi(b, d);
        const auto t5 = zip_lo(t1, t2);
        const auto t6 = zip_hi(t1, t2);
        const auto t7 = zip_lo(t3, t4);
        const auto t8 = zip_hi(t3, t4);
        const auto src1 = merge_low(t5, t6, A{});
        const auto src2 = merge_low(t7, t8, A{});
        const auto src3 = merge_high(t5, t6, A{});
        const auto src4 = merge_high(t7, t8, A{});
        src1.store(dstPtr, U{});
        src2.store(dstPtr + batch<T, A>::size, U{});
        src3.store(dstPtr + batch<T, A>::size * 2, U{});
        src4.store(dstPtr + batch<T, A>::size * 3, U{});
    }
#endif

    template<typename T, typename A, bool aligned = false>
    static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, batch<T, A> const &c, batch<T, A> const &d)
    {
        return interleave<T, A, aligned>(dst, a, b, c, d, A{});
    }

    template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, batch<T, A> &c, batch<T, A> &d, kernel::requires_arch<generic>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto t1 = batch<T, A>::load(srcPtr, U{});
        const auto t2 = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto t3 = batch<T, A>::load(srcPtr + batch<T, A>::size * 2, U{});
        const auto t4 = batch<T, A>::load(srcPtr + batch<T, A>::size * 3, U{});
        const auto src1 = zip_lo(t1, t3);
        const auto src2 = zip_hi(t1, t3);
        const auto src3 = zip_lo(t2, t4);
        const auto src4 = zip_hi(t2, t4);
        a = zip_lo(src1, src3);
        b = zip_hi(src1, src3);
        c = zip_lo(src2, src4);
        d = zip_hi(src2, src4);
    }

#if XSIMD_WITH_AVX
    template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, batch<T, A> &c, batch<T, A> &d, kernel::requires_arch<avx>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        // Variable names track element positions: a0b0c0d0_a1b1c1d1 holds
        // groups 0 and 1, one per 128-bit lane.
        const auto a0b0c0d0_a1b1c1d1 = batch<T, A>::load(srcPtr, U{});
        const auto a2b2c2d2_a3b3c3d3 =
            batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto a4b4c4d4_a5b5c5d5 =
            batch<T, A>::load(srcPtr + batch<T, A>::size * 2, U{});
        const auto a6b6c6d6_a7b7c7d7 =
            batch<T, A>::load(srcPtr + batch<T, A>::size * 3, U{});

        const auto a0a2b0b2_a1a3b1b3 =
            zip_lo(a0b0c0d0_a1b1c1d1, a2b2c2d2_a3b3c3d3);
        const auto c0c2d0d2_c1c3d1d3 =
            zip_hi(a0b0c0d0_a1b1c1d1, a2b2c2d2_a3b3c3d3);
        const auto a0a2b0b2_c0c2d0d2 =
            merge_low(a0a2b0b2_a1a3b1b3, c0c2d0d2_c1c3d1d3, A{});
        const auto a1a3b1b3_c1c3d1d3 =
            merge_high(a0a2b0b2_a1a3b1b3, c0c2d0d2_c1c3d1d3, A{});
        const auto a0a1a2a3_c0c1c2c3 =
            zip_lo(a0a2b0b2_c0c2d0d2, a1a3b1b3_c1c3d1d3);
        const auto b0b1b2b3_d0d1d2d3 =
            zip_hi(a0a2b0b2_c0c2d0d2, a1a3b1b3_c1c3d1d3);

        const auto a4a6b4b6_a5a7b5b7 =
            zip_lo(a4b4c4d4_a5b5c5d5, a6b6c6d6_a7b7c7d7);
        const auto c4c6d4d6_c5c7d5d7 =
            zip_hi(a4b4c4d4_a5b5c5d5, a6b6c6d6_a7b7c7d7);
        const auto a4a6b4b6_c4c6d4d6 =
            merge_low(a4a6b4b6_a5a7b5b7, c4c6d4d6_c5c7d5d7, A{});
        const auto a5a7b5b7_c5c7d5d7 =
            merge_high(a4a6b4b6_a5a7b5b7, c4c6d4d6_c5c7d5d7, A{});
        const auto a4a5a6a7_c4c5c6c7 =
            zip_lo(a4a6b4b6_c4c6d4d6, a5a7b5b7_c5c7d5d7);
        const auto b4b5b6b7_d4d5d6d7 =
            zip_hi(a4a6b4b6_c4c6d4d6, a5a7b5b7_c5c7d5d7);

        a = merge_low(a0a1a2a3_c0c1c2c3, a4a5a6a7_c4c5c6c7, A{});
        b = merge_low(b0b1b2b3_d0d1d2d3, b4b5b6b7_d4d5d6d7, A{});
        c = merge_high(a0a1a2a3_c0c1c2c3, a4a5a6a7_c4c5c6c7, A{});
        d = merge_high(b0b1b2b3_d0d1d2d3, b4b5b6b7_d4d5d6d7, A{});
    }
#endif

    template<typename T, typename A, bool aligned = false>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, batch<T, A> &c, batch<T, A> &d)
    {
        return deinterleave<T, A, aligned>(src, a, b, c, d, A{});
    }
};

#endif // KO_RGBA_INTERLEAVERS
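
// Usage sketch: round-tripping both interleavers through the public
// dispatcher overloads on the default architecture. The function name,
// zeroed buffers, and round-trip harness below are illustrative
// assumptions, not part of this header.
//
//     #include <cstdint>
//
//     void example()
//     {
//         using batch_t = xsimd::batch<uint32_t>; // 4-byte lanes
//         constexpr auto n = batch_t::size;
//
//         // Stride-2 layout: a0 b0 a1 b1 ...
//         uint32_t buf2[2 * n] = {};
//         batch_t a, b;
//         KoRgbaInterleavers<16>::deinterleave(buf2, a, b);
//         KoRgbaInterleavers<16>::interleave(buf2, a, b); // restores buf2
//
//         // Stride-4 layout: a0 b0 c0 d0 a1 b1 c1 d1 ...
//         uint32_t buf4[4 * n] = {};
//         batch_t c, d;
//         KoRgbaInterleavers<32>::deinterleave(buf4, a, b, c, d);
//         KoRgbaInterleavers<32>::interleave(buf4, a, b, c, d); // restores buf4
//     }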