/*
 * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy@amyspark.me>
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#ifndef KO_RGBA_INTERLEAVERS
#define KO_RGBA_INTERLEAVERS

#include <xsimd_extensions/xsimd.hpp>

#include <cstddef>
#include <type_traits>

#if XSIMD_VERSION_MAJOR >= 10
#error "The interleavers use per-lane zipping semantics, which are not compatible with xsimd 10"
#endif

using namespace xsimd;

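// SFINAE helpers used below: enable_sized_t restricts an overload to element
// types of a given size (4 bytes here), enable_sized_integral_t additionally
// requires an integral type, and enable_sized_vector_t restricts to batches
// of a given element count (four here).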
template<typename T, size_t S>
using enable_sized_t = typename std::enable_if<sizeof(T) == S, int>::type;

template<typename T, size_t S>
using enable_sized_integral_t =
    typename std::enable_if<std::is_integral<T>::value && sizeof(T) == S,
                            int>::type;

template<typename T, typename A, size_t S>
using enable_sized_vector_t = typename std::enable_if<batch<T, A>::size == S, int>::type;

#if XSIMD_WITH_AVX2
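// exchange_mid_halves: swap the two middle 64-bit elements of a 256-bit
// register, i.e. [q0 q1 q2 q3] -> [q0 q2 q1 q3].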
template<typename A>
inline batch<float, A> exchange_mid_halves(batch<float, A> const &a, kernel::requires_arch<avx2>) noexcept
{
    return _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(a.data), 0xD8));
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> exchange_mid_halves(batch<T, A> const &a, kernel::requires_arch<avx2>) noexcept
{
    return _mm256_permute4x64_epi64(a.data, 0xD8);
}
#endif

#if XSIMD_WITH_AVX
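// merge_low: concatenate the low 128-bit halves of a and b, giving [lo(a), lo(b)].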
template<typename A>
inline batch<float, A> merge_low(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_insertf128_ps(a, _mm256_castps256_ps128(b), 1);
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> merge_low(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_insertf128_si256(a, _mm256_castsi256_si128(b), 1);
}

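// merge_high: concatenate the high 128-bit halves of a and b, giving [hi(a), hi(b)].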
template<typename A>
inline batch<float, A> merge_high(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_permute2f128_ps(a, b, 0x31);
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> merge_high(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_permute2f128_si256(a, b, 0x31);
}

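// duplicate_low_halves: within each 128-bit lane, gather the even-indexed
// 32-bit elements of a followed by those of b: [a0 a2 b0 b2] per lane.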
template<typename A>
inline batch<float, A> duplicate_low_halves(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> duplicate_low_halves(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
}

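// duplicate_high_halves: within each 128-bit lane, gather the odd-indexed
// 32-bit elements of a followed by those of b: [a1 a3 b1 b3] per lane.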
template<typename A>
inline batch<float, A> duplicate_high_halves(batch<float, A> const &a, batch<float, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
}

template<typename T, typename A, enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> duplicate_high_halves(batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>) noexcept
{
    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
}
#endif

template<size_t N>
struct KoRgbaInterleavers;

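// KoRgbaInterleavers<16>: element-wise zip of two batches of 32-bit values.
// interleave(dst, a, b) stores a0 b0 a1 b1 ... as 2 * batch-size contiguous
// values at dst; deinterleave(src, a, b) performs the inverse split.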
template<>
struct KoRgbaInterleavers<16> {
    template<bool aligned, typename T, typename A, enable_sized_integral_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
    static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<generic>)
    {
        auto *dstPtr = static_cast<T *>(dst);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
        const auto t1 = zip_lo(a, b);
        const auto t2 = zip_hi(a, b);
        t1.store(dstPtr, U{});
        t2.store(dstPtr + batch<T, A>::size, U{});
    }

    // The AVX versions are handmade ports of the ones generated
    // by Clang 14.0.0: https://godbolt.org/z/Ts8MWosW3
    // The exception is interleave(avx), which comes from GCC 11.2.

#if XSIMD_WITH_AVX
    template<bool aligned, typename T, typename A, enable_sized_t<T, 4> = 0>
    static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, kernel::requires_arch<avx>)
    {
        auto *dstPtr = static_cast<T *>(dst);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
        const auto t1 = zip_lo(a, b);
        const auto t2 = zip_hi(a, b);
        const auto src1 = merge_low(t1, t2, A{});
        const auto src2 = merge_high(t1, t2, A{});
        src1.store(dstPtr, U{});
        src2.store(dstPtr + batch<T, A>::size, U{});
    }
#endif

    template<typename T, typename A, bool aligned = false>
    static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b)
    {
        return interleave<aligned>(dst, a, b, A{});
    }

    template<bool aligned, typename T, typename A, enable_sized_integral_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &dst1, batch<T, A> &dst2, kernel::requires_arch<generic>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto a = batch<T, A>::load(srcPtr, U{});
        const auto b = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto t1 = zip_lo(a, b);
        const auto t2 = zip_hi(a, b);
        dst1 = zip_lo(t1, t2);
        dst2 = zip_hi(t1, t2);
    }

#if XSIMD_WITH_AVX2
    template<bool aligned, typename T, typename A, enable_sized_t<T, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, kernel::requires_arch<avx2>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
        const auto src1 = batch<T, A>::load(srcPtr, U{});
        const auto src2 = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto t1 = duplicate_low_halves(src1, src2, A{});
        a = exchange_mid_halves(t1, A{});
        const auto t2 = duplicate_high_halves(src1, src2, A{});
        b = exchange_mid_halves(t2, A{});
    }
#endif
#if XSIMD_WITH_AVX
    template<bool aligned, typename T, typename A, enable_sized_t<T, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, kernel::requires_arch<avx>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;
        const auto src1 = batch<T, A>::load(srcPtr, U{});
        const auto src2 =
            batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto t1 = merge_high(src1, src2, A{});
        const auto t2 = merge_low(src1, src2, A{});
        a.data = duplicate_low_halves(t2, t1, A{});
        b.data = duplicate_high_halves(t2, t1, A{});
    }
#endif

    template<typename T, typename A, bool aligned = false>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b)
    {
        return deinterleave<aligned>(src, a, b, A{});
    }
};

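// KoRgbaInterleavers<32>: element-wise zip of four batches of 32-bit values.
// interleave(dst, a, b, c, d) stores a0 b0 c0 d0 a1 b1 c1 d1 ... at dst
// (e.g. four planar RGBA channel registers into interleaved pixels);
// deinterleave() performs the inverse split.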
template<>
struct KoRgbaInterleavers<32> {
    template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
    static inline void
    interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, batch<T, A> const &c, batch<T, A> const &d, kernel::requires_arch<generic>)
    {
        auto *dstPtr = static_cast<T *>(dst);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto t1 = zip_lo(a, c);
        const auto t2 = zip_hi(a, c);
        const auto t3 = zip_lo(b, d);
        const auto t4 = zip_hi(b, d);
        const auto src1 = zip_lo(t1, t3);
        const auto src2 = zip_hi(t1, t3);
        const auto src3 = zip_lo(t2, t4);
        const auto src4 = zip_hi(t2, t4);
        src1.store(dstPtr, U{});
        src2.store(dstPtr + batch<T, A>::size, U{});
        src3.store(dstPtr + batch<T, A>::size * 2, U{});
        src4.store(dstPtr + batch<T, A>::size * 3, U{});
    }

#if XSIMD_WITH_AVX
    template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0>
    static inline void
    interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, batch<T, A> const &c, batch<T, A> const &d, kernel::requires_arch<avx>)
    {
        auto *dstPtr = static_cast<T *>(dst);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto t1 = zip_lo(a, c);
        const auto t2 = zip_lo(b, d);
        const auto t3 = zip_hi(a, c);
        const auto t4 = zip_hi(b, d);
        const auto t5 = zip_lo(t1, t2);
        const auto t6 = zip_hi(t1, t2);
        const auto t7 = zip_lo(t3, t4);
        const auto t8 = zip_hi(t3, t4);
        const auto src1 = merge_low(t5, t6, A{});
        const auto src2 = merge_low(t7, t8, A{});
        const auto src3 = merge_high(t5, t6, A{});
        const auto src4 = merge_high(t7, t8, A{});
        src1.store(dstPtr, U{});
        src2.store(dstPtr + batch<T, A>::size, U{});
        src3.store(dstPtr + batch<T, A>::size * 2, U{});
        src4.store(dstPtr + batch<T, A>::size * 3, U{});
    }
#endif

    template<typename T, typename A, bool aligned = false>
    static inline void interleave(void *dst, batch<T, A> const &a, batch<T, A> const &b, batch<T, A> const &c, batch<T, A> const &d)
    {
        return interleave<T, A, aligned>(dst, a, b, c, d, A{});
    }

    template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0, enable_sized_vector_t<T, A, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, batch<T, A> &c, batch<T, A> &d, kernel::requires_arch<generic>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto t1 = batch<T, A>::load(srcPtr, U{});
        const auto t2 = batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto t3 = batch<T, A>::load(srcPtr + batch<T, A>::size * 2, U{});
        const auto t4 = batch<T, A>::load(srcPtr + batch<T, A>::size * 3, U{});
        const auto src1 = zip_lo(t1, t3);
        const auto src2 = zip_hi(t1, t3);
        const auto src3 = zip_lo(t2, t4);
        const auto src4 = zip_hi(t2, t4);
        a = zip_lo(src1, src3);
        b = zip_hi(src1, src3);
        c = zip_lo(src2, src4);
        d = zip_hi(src2, src4);
    }

#if XSIMD_WITH_AVX
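    // In the AVX path below, the variable names encode register contents:
    // the elements of the low 128-bit lane before the underscore and the
    // elements of the high lane after it.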
    template<typename T, typename A, bool aligned = false, enable_sized_t<T, 4> = 0>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, batch<T, A> &c, batch<T, A> &d, kernel::requires_arch<avx>)
    {
        const auto *srcPtr = static_cast<const T *>(src);
        using U = std::conditional_t<aligned, aligned_mode, unaligned_mode>;

        const auto a0b0c0d0_a1b1c1d1 = batch<T, A>::load(srcPtr, U{});
        const auto a2b2c2d2_a3b3c3d3 =
            batch<T, A>::load(srcPtr + batch<T, A>::size, U{});
        const auto a4b4c4d4_a5b5c5d5 =
            batch<T, A>::load(srcPtr + batch<T, A>::size * 2, U{});
        const auto a6b6c6d6_a7b7c7d7 =
            batch<T, A>::load(srcPtr + batch<T, A>::size * 3, U{});

        const auto a0a2b0b2_a1a3b1b3 =
            zip_lo(a0b0c0d0_a1b1c1d1, a2b2c2d2_a3b3c3d3);
        const auto c0c2d0d2_c1c3d1d3 =
            zip_hi(a0b0c0d0_a1b1c1d1, a2b2c2d2_a3b3c3d3);
        const auto a0a2b0b2_c0c2d0d2 =
            merge_low(a0a2b0b2_a1a3b1b3, c0c2d0d2_c1c3d1d3, A{});
        const auto a1a3b1b3_c1c3d1d3 =
            merge_high(a0a2b0b2_a1a3b1b3, c0c2d0d2_c1c3d1d3, A{});
        const auto a0a1a2a3_c0c1c2c3 =
            zip_lo(a0a2b0b2_c0c2d0d2, a1a3b1b3_c1c3d1d3);
        const auto b0b1b2b3_d0d1d2d3 =
            zip_hi(a0a2b0b2_c0c2d0d2, a1a3b1b3_c1c3d1d3);

        const auto a4a6b4b6_a5a7b5b7 =
            zip_lo(a4b4c4d4_a5b5c5d5, a6b6c6d6_a7b7c7d7);
        const auto c4c6d4d6_c5c7d5d7 =
            zip_hi(a4b4c4d4_a5b5c5d5, a6b6c6d6_a7b7c7d7);
        const auto a4a6b4b6_c4c6d4d6 =
            merge_low(a4a6b4b6_a5a7b5b7, c4c6d4d6_c5c7d5d7, A{});
        const auto a5a7b5b7_c5c7d5d7 =
            merge_high(a4a6b4b6_a5a7b5b7, c4c6d4d6_c5c7d5d7, A{});
        const auto a4a5a6a7_c4c5c6c7 =
            zip_lo(a4a6b4b6_c4c6d4d6, a5a7b5b7_c5c7d5d7);
        const auto b4b5b6b7_d4d5d6d7 =
            zip_hi(a4a6b4b6_c4c6d4d6, a5a7b5b7_c5c7d5d7);

        a = merge_low(a0a1a2a3_c0c1c2c3, a4a5a6a7_c4c5c6c7, A{});
        b = merge_low(b0b1b2b3_d0d1d2d3, b4b5b6b7_d4d5d6d7, A{});
        c = merge_high(a0a1a2a3_c0c1c2c3, a4a5a6a7_c4c5c6c7, A{});
        d = merge_high(b0b1b2b3_d0d1d2d3, b4b5b6b7_d4d5d6d7, A{});
    }
#endif

    template<typename T, typename A, bool aligned = false>
    static inline void deinterleave(const void *src, batch<T, A> &a, batch<T, A> &b, batch<T, A> &c, batch<T, A> &d)
    {
        return deinterleave<T, A, aligned>(src, a, b, c, d, A{});
    }
};
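
// A minimal usage sketch (illustrative only; the xsimd::sse2 architecture tag,
// the buffer names, and the channel order are assumptions, not part of this
// header):
//
//   using batch_t = xsimd::batch<uint32_t, xsimd::sse2>;
//   const auto r = batch_t::load_unaligned(red);   // r0 r1 r2 r3
//   const auto g = batch_t::load_unaligned(green); // g0 g1 g2 g3
//   const auto b = batch_t::load_unaligned(blue);  // b0 b1 b2 b3
//   const auto a = batch_t::load_unaligned(alpha); // a0 a1 a2 a3
//
//   // dst receives r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
//   KoRgbaInterleavers<32>::interleave(dst, r, g, b, a);
//
//   // The inverse: split interleaved pixels back into per-channel registers.
//   batch_t rr, gg, bb, aa;
//   KoRgbaInterleavers<32>::deinterleave(dst, rr, gg, bb, aa);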

#endif // KO_RGBA_INTERLEAVERS