/*
 *  SPDX-FileCopyrightText: 2012 Dmitry Kazakov <dimula73@gmail.com>
 *  SPDX-FileCopyrightText: 2020 Mathias Wein <lynx.mw+kde@gmail.com>
 *  SPDX-FileCopyrightText: 2022 L. E. Segovia <amy@amyspark.me>
 *
 * SPDX-License-Identifier: LGPL-2.1-or-later
 */

#ifndef __KOSTREAMED_MATH_H
#define __KOSTREAMED_MATH_H

#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
#error "Trying to use SIMD with an unknown architecture!"
#endif

#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <type_traits>
#include <xsimd_extensions/xsimd.hpp>

#if XSIMD_VERSION_MAJOR < 10
#include <KoRgbaInterleavers.h>
#endif

#include <KoAlwaysInline.h>
#include <KoColorSpaceMaths.h>
#include <KoCompositeOp.h>

#define BLOCKDEBUG 0

template<typename _impl, typename result_type>
struct OptiRound {
    ALWAYS_INLINE static result_type roundScalar(const float value)
    {
#ifdef __SSE__
        // SSE/AVX instructions use the round-to-even rounding rule, so we
        // should reuse it when possible
        return _mm_cvtss_si32(_mm_set_ss(value));
#elif XSIMD_WITH_NEON64
        return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vdupq_n_f32(value))),
                              0);
#elif XSIMD_WITH_NEON
        /* origin:
         * https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047
         */
        //  Contributors to this work are:
        //   John W. Ratcliff <jratcliffscarab@gmail.com>
        //   Brandon Rowlett <browlett@nvidia.com>
        //   Ken Fast <kfast@gdeb.com>
        //   Eric van Beurden <evanbeurden@nvidia.com>
        //   Alexander Potylitsin <apotylitsin@nvidia.com>
        //   Hasindu Gamaarachchi <hasindu2008@gmail.com>
        //   Jim Huang <jserv@biilabs.io>
        //   Mark Cheng <marktwtn@biilabs.io>
        //   Malcolm James MacLeod <malcolm@gulden.com>
        //   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
        //   Sebastian Pop <spop@amazon.com>
        //   Developer Ecosystem Engineering
        //   <DeveloperEcosystemEngineering@apple.com> Danila Kutenin
        //   <danilak@google.com> François Turban (JishinMaster)
        //   <francois.turban@gmail.com> Pei-Hsuan Hung <afcidk@gmail.com>
        //   Yang-Hao Yuan <yanghau@biilabs.io>
        //   Syoyo Fujita <syoyo@lighttransport.com>
        //   Brecht Van Lommel <brecht@blender.org>

        /*
         * sse2neon is freely redistributable under the MIT License.
         *
         * Permission is hereby granted, free of charge, to any person obtaining
         * a copy of this software and associated documentation files (the
         * "Software"), to deal in the Software without restriction, including
         * without limitation the rights to use, copy, modify, merge, publish,
         * distribute, sublicense, and/or sell copies of the Software, and to
         * permit persons to whom the Software is furnished to do so, subject to
         * the following conditions:
         *
         * The above copyright notice and this permission notice shall be
         * included in all copies or substantial portions of the Software.
         *
         * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
         * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
         * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
         * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
         * SOFTWARE.
         */
        const auto nearbyint_as_int = [](const float v) {
            const auto a = vdupq_n_f32(v);
            const auto signmask = vdupq_n_u32(0x80000000);
            const auto half =
                vbslq_f32(signmask, a, vdupq_n_f32(0.5f)); /* +/- 0.5 */
            const auto r_normal = vcvtq_s32_f32(
                vaddq_f32(a, half)); /* round to integer: [a + 0.5]*/
            const auto r_trunc =
                vcvtq_s32_f32(a); /* truncate to integer: [a] */
            const auto plusone = vreinterpretq_s32_u32(
                vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)),
                            31)); /* 1 or 0 */
            const auto r_even =
                vbicq_s32(vaddq_s32(r_trunc, plusone),
                          vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
            const auto delta = vsubq_f32(
                a,
                vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
            const auto is_delta_half =
                vceqq_f32(delta, half); /* delta == +/- 0.5 */
            return vbslq_s32(is_delta_half, r_even, r_normal);
        };
        return vgetq_lane_s32(nearbyint_as_int(value), 0);
#else
        return std::lroundf(value);
#endif
    }
};
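
// Note on halfway values: the SSE and NEON paths above round ties to even
// (e.g. roundScalar(2.5f) == 2 under the default round-to-nearest mode),
// while the std::lroundf() fallback rounds halfway cases away from zero
// (std::lroundf(2.5f) == 3), so the scalar fallback may differ by one on
// exact .5 inputs.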

template<typename _impl>
struct OptiDiv {
    using float_v = xsimd::batch<float, _impl>;

    ALWAYS_INLINE static float divScalar(const float &dividend, const float &divisor)
    {
#ifdef __SSE__
        float result = NAN;

        __m128 x = _mm_set_ss(divisor);
        __m128 y = _mm_set_ss(dividend);
        x = _mm_rcp_ss(x);
        x = _mm_mul_ss(x, y);

        _mm_store_ss(&result, x);
        return result;
#elif defined __ARM_NEON
        auto x = vdupq_n_f32(divisor);
        auto y = vdupq_n_f32(dividend);
        x = vrecpeq_f32(x);
        x = vmulq_f32(x, y);

        return vgetq_lane_f32(x, 0);
#else
        return (1.f / divisor) * dividend;
#endif
    }

    ALWAYS_INLINE static float_v divVector(const float_v &dividend, const float_v &divisor)
    {
        return dividend * xsimd::reciprocal(divisor);
    }
};
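
// Note: _mm_rcp_ss, vrecpeq_f32 and xsimd::reciprocal() compute a fast
// *approximate* reciprocal (roughly 12 bits of precision on SSE, and a
// coarser estimate on NEON), so divScalar()/divVector() trade a little
// accuracy for speed; e.g. divScalar(1.0f, 3.0f) may differ from
// 1.0f / 3.0f in the low mantissa bits.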

template<typename _impl>
struct KoStreamedMath {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    /**
     * Composes src into dst without using vector instructions
     */
    template<bool useMask, bool useFlow, class Compositor, int pixelSize>
    static void genericComposite_novector(const KoCompositeOp::ParameterInfo &params)
    {
        using namespace Arithmetic;

        const qint32 linearInc = pixelSize;
        qint32 srcLinearInc = params.srcRowStride ? pixelSize : 0;

        quint8 *dstRowStart = params.dstRowStart;
        const quint8 *maskRowStart = params.maskRowStart;
        const quint8 *srcRowStart = params.srcRowStart;
        typename Compositor::ParamsWrapper paramsWrapper(params);

        for (qint32 r = params.rows; r > 0; --r) {
            const quint8 *mask = maskRowStart;
            const quint8 *src = srcRowStart;
            quint8 *dst = dstRowStart;

            int blockRest = params.cols;

            for (int i = 0; i < blockRest; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(src,
                                                                             dst,
                                                                             mask,
                                                                             params.opacity,
                                                                             paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;

                if (useMask) {
                    mask++;
                }
            }

            srcRowStart += params.srcRowStride;
            dstRowStart += params.dstRowStride;

            if (useMask) {
                maskRowStart += params.maskRowStride;
            }
        }
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite32_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 4>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite128_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 16>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite64_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 8>(params);
    }

    static inline quint8 round_float_to_u8(float x)
    {
        return OptiRound<_impl, quint8>::roundScalar(x);
    }

    static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha)
    {
        return round_float_to_u8(float(b - a) * alpha + float(a));
    }

    /**
     * Get a vector containing the first float_v::size values of the mask.
     * Each source mask element is considered to be an 8-bit integer.
     */
    static inline float_v fetch_mask_8(const quint8 *data)
    {
        return xsimd::batch_cast<float>(xsimd::load_and_extend<int_v>(data));
    }

    /**
     * Get alpha values from float_v::size pixels, 32 bits each
     * (4 channels, 8 bits per channel). The alpha value is considered
     * to be stored in the most significant byte of each pixel.
     *
     * \p aligned controls whether \p data is fetched using an aligned
     *            instruction or not.
     *            1) Fetching aligned data with an unaligned instruction
     *               degrades performance.
     *            2) Fetching unaligned data with an aligned instruction
     *               causes \#GP (General Protection Exception)
     */
    template<bool aligned>
    static inline float_v fetch_alpha_32(const void *data)
    {
        using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;
        const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});
        return xsimd::to_float(xsimd::bitwise_cast_compat<int>(data_i >> 24));
    }

    /**
     * Get color values from float_v::size pixels, 32 bits each
     * (4 channels, 8 bits per channel). The color data is considered
     * to be stored in the 3 least significant bytes of each pixel.
     *
     * \p aligned controls whether \p data is fetched using an aligned
     *            instruction or not.
     *            1) Fetching aligned data with an unaligned instruction
     *               degrades performance.
     *            2) Fetching unaligned data with an aligned instruction
     *               causes \#GP (General Protection Exception)
     */
    template<bool aligned>
    static inline void fetch_colors_32(const void *data, float_v &c1, float_v &c2, float_v &c3)
    {
        using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;

        const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});

        const uint_v mask(0xFF);

        c1 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i >> 16) & mask));
        c2 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i >> 8) & mask));
        c3 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i) & mask));
    }
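
    // Layout sketch for fetch_alpha_32()/fetch_colors_32() above, assuming a
    // BGRA8-style pixel stored as bytes B, G, R, A: a little-endian 32-bit
    // load yields 0xAARRGGBB, so alpha is (word >> 24), c1 is
    // (word >> 16) & 0xFF, c2 is (word >> 8) & 0xFF and c3 is word & 0xFF.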

    /**
     * Pack color and alpha values into float_v::size pixels, 32 bits each
     * (4 channels, 8 bits per channel). The color data is considered
     * to be stored in the 3 least significant bytes of each pixel and the
     * alpha in the most significant byte.
     *
     * NOTE: \p data must be an aligned pointer!
     */
    static inline void
    write_channels_32(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
    {
        const int_v mask(0xFF);

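        // xsimd::nearbyint_as_int() rounds ties to even, matching
        // OptiRound::roundScalar(). Alpha needs no "& mask" here because the
        // << 24 shift keeps only its low byte inside each 32-bit lane
        // (callers are expected to pass values already in the 0..255 range).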
        const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
        const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
        const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
        const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
        xsimd::store_aligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
    }

    static inline void
    write_channels_32_unaligned(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
    {
        const int_v mask(0xFF);

        const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
        const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
        const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
        const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
        xsimd::store_unaligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
    }

    /**
     * Composes src pixels into dst pixels. Optimized for 32-bit-per-pixel
     * colorspaces. Uses the \p Compositor strategy parameter to do the
     * actual math of the composition.
     */
    template<bool useMask, bool useFlow, class Compositor, int pixelSize>
    static void genericComposite(const KoCompositeOp::ParameterInfo &params)
    {
        using namespace Arithmetic;

        const int vectorSize = static_cast<int>(float_v::size);
        const qint32 vectorInc = pixelSize * vectorSize;
        const qint32 linearInc = pixelSize;
        qint32 srcVectorInc = vectorInc;
        qint32 srcLinearInc = pixelSize;

        quint8 *dstRowStart = params.dstRowStart;
        const quint8 *maskRowStart = params.maskRowStart;
        const quint8 *srcRowStart = params.srcRowStart;
        typename Compositor::ParamsWrapper paramsWrapper(params);

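        // When srcRowStride is 0 the source is a single, constant pixel;
        // replicate it into a small vector-aligned buffer so the vector path
        // can keep loading from it, and stop advancing the source pointers
        // (srcLinearInc and srcVectorInc become 0).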
        if (!params.srcRowStride) {
            if (pixelSize == 4) {
                auto *buf = reinterpret_cast<uint_v *>(xsimd::vector_aligned_malloc<typename uint_v::value_type>(vectorSize));
                *buf = uint_v(*(reinterpret_cast<const quint32 *>(srcRowStart)));
                srcRowStart = reinterpret_cast<quint8 *>(buf);
                srcLinearInc = 0;
                srcVectorInc = 0;
            } else {
                auto *buf = xsimd::vector_aligned_malloc<quint8>(vectorInc);
                quint8 *ptr = buf;

                for (size_t i = 0; i < vectorSize; i++) {
                    memcpy(ptr, params.srcRowStart, pixelSize);
                    ptr += pixelSize;
                }

                srcRowStart = buf;
                srcLinearInc = 0;
                srcVectorInc = 0;
            }
        }
#if BLOCKDEBUG
        int totalBlockAlign = 0;
        int totalBlockAlignedVector = 0;
        int totalBlockUnalignedVector = 0;
        int totalBlockRest = 0;
#endif

        for (qint32 r = params.rows; r > 0; --r) {
            // Hint: Mask is allowed to be unaligned
            const quint8 *mask = maskRowStart;

            const quint8 *src = srcRowStart;
            quint8 *dst = dstRowStart;

            const int pixelsAlignmentMask = vectorSize * sizeof(float) - 1;
            auto srcPtrValue = reinterpret_cast<uintptr_t>(src);
            auto dstPtrValue = reinterpret_cast<uintptr_t>(dst);
            uintptr_t srcAlignment = srcPtrValue & pixelsAlignmentMask;
            uintptr_t dstAlignment = dstPtrValue & pixelsAlignmentMask;

            // Uncomment if facing problems with alignment:
            // Q_ASSERT_X(!(dstAlignment & 3), "Compositing",
            //            "Pixel data must be aligned on pixels borders!");

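            // Split the row into a scalar head (blockAlign) that brings dst up
            // to vector alignment, a vectorized middle (aligned or unaligned
            // loads depending on whether src and dst share the same
            // misalignment), and a scalar tail (blockRest) for the leftover
            // pixels.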
            int blockAlign = params.cols;
            int blockAlignedVector = 0;
            int blockUnalignedVector = 0;
            int blockRest = 0;

            int *vectorBlock =
                srcAlignment == dstAlignment || !srcVectorInc ? &blockAlignedVector : &blockUnalignedVector;

            if (!dstAlignment) {
                blockAlign = 0;
                *vectorBlock = params.cols / vectorSize;
                blockRest = params.cols % vectorSize;
            } else if (params.cols > 2 * vectorSize) {
                blockAlign = (vectorInc - dstAlignment) / pixelSize;
                const int restCols = params.cols - blockAlign;
                if (restCols > 0) {
                    *vectorBlock = restCols / vectorSize;
                    blockRest = restCols % vectorSize;
                } else {
                    blockAlign = params.cols;
                    *vectorBlock = 0;
                    blockRest = 0;
                }
            }
#if BLOCKDEBUG
            totalBlockAlign += blockAlign;
            totalBlockAlignedVector += blockAlignedVector;
            totalBlockUnalignedVector += blockUnalignedVector;
            totalBlockRest += blockRest;
#endif

            for (int i = 0; i < blockAlign; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(src,
                                                                             dst,
                                                                             mask,
                                                                             params.opacity,
                                                                             paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;

                if (useMask) {
                    mask++;
                }
            }

            for (int i = 0; i < blockAlignedVector; i++) {
                Compositor::template compositeVector<useMask, true, _impl>(src,
                                                                           dst,
                                                                           mask,
                                                                           params.opacity,
                                                                           paramsWrapper);
                src += srcVectorInc;
                dst += vectorInc;

                if (useMask) {
                    mask += vectorSize;
                }
            }

            for (int i = 0; i < blockUnalignedVector; i++) {
                Compositor::template compositeVector<useMask, false, _impl>(src,
                                                                            dst,
                                                                            mask,
                                                                            params.opacity,
                                                                            paramsWrapper);
                src += srcVectorInc;
                dst += vectorInc;

                if (useMask) {
                    mask += vectorSize;
                }
            }

            for (int i = 0; i < blockRest; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(src,
                                                                             dst,
                                                                             mask,
                                                                             params.opacity,
                                                                             paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;

                if (useMask) {
                    mask++;
                }
            }

            srcRowStart += params.srcRowStride;
            dstRowStart += params.dstRowStride;

            if (useMask) {
                maskRowStart += params.maskRowStride;
            }
        }

#if BLOCKDEBUG
        dbgPigment << "I"
                   << "rows:" << params.rows << "\tpad(S):" << totalBlockAlign << "\tbav(V):" << totalBlockAlignedVector
                   << "\tbuv(V):" << totalBlockUnalignedVector << "\tres(S)"
                   << totalBlockRest; // << srcAlignment << dstAlignment;
#endif

        if (!params.srcRowStride) {
            xsimd::vector_aligned_free(srcRowStart);
        }
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite32(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 4>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite128(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 16>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite64(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 8>(params);
    }
};
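
/**
 * Typical usage (a minimal sketch, not taken from this file): a composite op
 * provides a Compositor policy class that exposes a ParamsWrapper type plus
 * compositeOnePixelScalar<useMask, arch>() and
 * compositeVector<useMask, aligned, arch>() member templates, and then
 * dispatches on the pixel size, e.g.
 *
 *   // "MyCompositor" is a hypothetical policy class used for illustration
 *   KoStreamedMath<arch>::genericComposite32<true, true, MyCompositor>(params);
 *
 * genericComposite64/genericComposite128 serve 8- and 16-byte pixel formats.
 */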

template<typename channels_type, class _impl>
struct PixelStateRecoverHelper {
    using float_v = xsimd::batch<float, _impl>;
    using float_m = typename float_v::batch_bool_type;

    ALWAYS_INLINE
    PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
    {
        Q_UNUSED(c1);
        Q_UNUSED(c2);
        Q_UNUSED(c3);
    }

    ALWAYS_INLINE
    void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const {
        Q_UNUSED(mask);
        Q_UNUSED(c1);
        Q_UNUSED(c2);
        Q_UNUSED(c3);
    }
};
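
/**
 * The generic PixelStateRecoverHelper above is a no-op: for integer channel
 * types there is nothing to recover. The float specialization below keeps a
 * copy of the original c1..c3 channels and, in recoverPixels(), writes them
 * back for every lane where \p mask is set.
 */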

template<class _impl>
struct PixelStateRecoverHelper<float, _impl> {
    using float_v = xsimd::batch<float, _impl>;
    using float_m = typename float_v::batch_bool_type;

    ALWAYS_INLINE
    PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
        : m_orig_c1(c1),
          m_orig_c2(c2),
          m_orig_c3(c3)
    {
    }

    ALWAYS_INLINE
    void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const {
        if (xsimd::any(mask)) {
            c1 = xsimd::select(mask, m_orig_c1, c1);
            c2 = xsimd::select(mask, m_orig_c2, c2);
            c3 = xsimd::select(mask, m_orig_c3, c3);
        }
    }

private:
    const float_v m_orig_c1;
    const float_v m_orig_c2;
    const float_v m_orig_c3;
};

template<typename channels_type, class _impl>
struct PixelWrapper
{
};

template<class _impl>
struct PixelWrapper<quint16, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    ALWAYS_INLINE
    static quint16 lerpMixedUintFloat(quint16 a, quint16 b, float alpha)
    {
        return OptiRound<_impl, quint16>::roundScalar((float(b) - a) * alpha + float(a));
    }

    ALWAYS_INLINE
    static quint16 roundFloatToUint(float x)
    {
        return OptiRound<_impl, quint16>::roundScalar(x);
    }

    ALWAYS_INLINE
    static void normalizeAlpha(float &alpha)
    {
        const float uint16Rec1 = 1.0f / 65535.0f;
        alpha *= uint16Rec1;
    }

    ALWAYS_INLINE
    static void denormalizeAlpha(float &alpha)
    {
        const float uint16Max = 65535.0f;
        alpha *= uint16Max;
    }

    PixelWrapper()
        : mask(0xFFFF)
        , uint16Max(65535.0f)
        , uint16Rec1(1.0f / 65535.0f)
    {
    }

    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
        // Each 64-bit pixel packs four quint16 channels into two 32-bit
        // words: the first word holds c1 | (c2 << 16), the second holds
        // c3 | (alpha << 16).
#if XSIMD_VERSION_MAJOR < 10
        uint_v pixelsC1C2;
        uint_v pixelsC3Alpha;
        KoRgbaInterleavers<16>::deinterleave(src, pixelsC1C2, pixelsC3Alpha);
#else
        const auto *srcPtr = static_cast<const typename uint_v::value_type *>(src);
        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2; // stride == 2
        const auto idx2 = idx1 + 1; // offset 1 == 2nd member

        const auto pixelsC1C2 = uint_v::gather(srcPtr, idx1);
        const auto pixelsC3Alpha = uint_v::gather(srcPtr, idx2);
#endif

        dst_c1 = xsimd::to_float(xsimd::bitwise_cast_compat<int>(pixelsC1C2 & mask)); // r
        dst_c2 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC1C2 >> 16) & mask)); // g
        dst_c3 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC3Alpha & mask))); // b
        dst_alpha = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC3Alpha >> 16) & mask)); // a

        dst_alpha *= uint16Rec1;
    }
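
    // After read(), the color channels keep their native 0..65535 range while
    // alpha has been normalized to [0, 1]; write() below re-expands alpha by
    // uint16Max before packing everything back into 16-bit lanes.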

    ALWAYS_INLINE void write(void *dst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
    {
        const auto alpha = a * uint16Max;

        const auto v1 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c1));
        const auto v2 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c2));
        const auto v3 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c3));
        const auto v4 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(alpha));

        const auto c1c2 = ((v2 & mask) << 16) | (v1 & mask);
        const auto c3ca = ((v4 & mask) << 16) | (v3 & mask);

#if XSIMD_VERSION_MAJOR < 10
        KoRgbaInterleavers<16>::interleave(dst, c1c2, c3ca);
#else
        auto dstPtr = reinterpret_cast<typename int_v::value_type *>(dst);

        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2;
        const auto idx2 = idx1 + 1;

        c1c2.scatter(dstPtr, idx1);
        c3ca.scatter(dstPtr, idx2);
#endif
    }

    ALWAYS_INLINE
    void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(quint16) * 4);
    }

    ALWAYS_INLINE
    void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(quint16) * 4);
    }

    const uint_v mask;
    const float_v uint16Max;
    const float_v uint16Rec1;
};

template<typename _impl>
struct PixelWrapper<quint8, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    ALWAYS_INLINE
    static quint8 lerpMixedUintFloat(quint8 a, quint8 b, float alpha)
    {
        return KoStreamedMath<_impl>::lerp_mixed_u8_float(a, b, alpha);
    }

    ALWAYS_INLINE
    static quint8 roundFloatToUint(float x)
    {
        return KoStreamedMath<_impl>::round_float_to_u8(x);
    }

    ALWAYS_INLINE
    static void normalizeAlpha(float &alpha)
    {
        const float uint8Rec1 = 1.0f / 255.0f;
        alpha *= uint8Rec1;
    }

    ALWAYS_INLINE
    static void denormalizeAlpha(float &alpha)
    {
        const float uint8Max = 255.0f;
        alpha *= uint8Max;
    }

    PixelWrapper()
        : mask(quint32(0xFF))
        , uint8Max(255.0f)
        , uint8Rec1(1.0f / 255.0f)
    {
    }

    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
        dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<false>(src);
        KoStreamedMath<_impl>::template fetch_colors_32<false>(src, dst_c1, dst_c2, dst_c3);

        dst_alpha *= uint8Rec1;
    }

    ALWAYS_INLINE
    void write(quint8 *dataDst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
    {
        const auto alpha = a * uint8Max;

        KoStreamedMath<_impl>::write_channels_32_unaligned(dataDst, alpha, c1, c2, c3);
    }

    ALWAYS_INLINE
    void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(quint8) * 4);
    }

    ALWAYS_INLINE
    void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(quint8) * 4);
    }

    const uint_v mask;
    const float_v uint8Max;
    const float_v uint8Rec1;
};

template<typename _impl>
struct PixelWrapper<float, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    struct Pixel {
        float red;
        float green;
        float blue;
        float alpha;
    };

    ALWAYS_INLINE
    static float lerpMixedUintFloat(float a, float b, float alpha)
    {
        return Arithmetic::lerp(a, b, alpha);
    }

    ALWAYS_INLINE
    static float roundFloatToUint(float x)
    {
        return x;
    }

    ALWAYS_INLINE
    static void normalizeAlpha(float &alpha)
    {
        Q_UNUSED(alpha);
    }

    ALWAYS_INLINE
    static void denormalizeAlpha(float &alpha)
    {
        Q_UNUSED(alpha);
    }

    PixelWrapper() = default;

    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
#if XSIMD_VERSION_MAJOR < 10
        KoRgbaInterleavers<32>::deinterleave(src, dst_c1, dst_c2, dst_c3, dst_alpha);
#else
        const auto srcPtr = reinterpret_cast<const typename float_v::value_type *>(src);
        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4; // stride == 4
        const auto idx2 = idx1 + 1;
        const auto idx3 = idx1 + 2;
        const auto idx4 = idx1 + 3;

        dst_c1 = float_v::gather(srcPtr, idx1);
        dst_c2 = float_v::gather(srcPtr, idx2);
        dst_c3 = float_v::gather(srcPtr, idx3);
        dst_alpha = float_v::gather(srcPtr, idx4);
#endif
    }

    ALWAYS_INLINE void
    write(void *dst, const float_v &src_c1, const float_v &src_c2, const float_v &src_c3, const float_v &src_alpha)
    {
#if XSIMD_VERSION_MAJOR < 10
        KoRgbaInterleavers<32>::interleave(dst, src_c1, src_c2, src_c3, src_alpha);
#else
        auto dstPtr = reinterpret_cast<typename float_v::value_type *>(dst);

        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4; // stride == 4
        const auto idx2 = idx1 + 1;
        const auto idx3 = idx1 + 2;
        const auto idx4 = idx1 + 3;

        src_c1.scatter(dstPtr, idx1);
        src_c2.scatter(dstPtr, idx2);
        src_c3.scatter(dstPtr, idx3);
        src_alpha.scatter(dstPtr, idx4);
#endif
    }

    ALWAYS_INLINE
    void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(float) * 4);
    }

    ALWAYS_INLINE
    void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(float) * 4);
    }
};
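
// PixelWrapper<float> operates on channels that are already floating point:
// normalizeAlpha(), denormalizeAlpha() and roundFloatToUint() are identity
// operations, and read()/write() only (de)interleave the RGBA layout without
// scaling any values.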

namespace KoStreamedMathFunctions
{
template<int pixelSize>
ALWAYS_INLINE void clearPixel(quint8 *dst)
{
    std::memset(dst, 0, pixelSize);
}

template<int pixelSize>
ALWAYS_INLINE void copyPixel(const quint8 *src, quint8 *dst)
{
    std::memcpy(dst, src, pixelSize);
}
} // namespace KoStreamedMathFunctions

#endif /* __KOSTREAMED_MATH_H */