/*
 * SPDX-FileCopyrightText: 2012 Dmitry Kazakov <dimula73@gmail.com>
 * SPDX-FileCopyrightText: 2020 Mathias Wein <lynx.mw+kde@gmail.com>
 * SPDX-FileCopyrightText: 2022 L. E. Segovia <amy@amyspark.me>
 *
 * SPDX-License-Identifier: LGPL-2.1-or-later
 */

#ifndef __KOSTREAMED_MATH_H
#define __KOSTREAMED_MATH_H

#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
#error "Trying to use SIMD with an unknown architecture!"
#endif

#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <type_traits>
#include <xsimd_extensions/xsimd.hpp>

#if XSIMD_VERSION_MAJOR < 10
#include <KoRgbaInterleavers.h>
#endif

#include <KoAlwaysInline.h>
#include <KoColorSpaceMaths.h>
#include <KoCompositeOp.h>

#define BLOCKDEBUG 0

template<typename _impl, typename result_type>
struct OptiRound {
    ALWAYS_INLINE static result_type roundScalar(const float value)
    {
#ifdef __SSE__
        // SSE/AVX instructions use the round-to-even rounding rule, so we
        // should reuse it when possible
        return _mm_cvtss_si32(_mm_set_ss(value));
#elif XSIMD_WITH_NEON64
        return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vdupq_n_f32(value))), 0);
#elif XSIMD_WITH_NEON
        /* origin:
         * https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047
         */
        // Contributors to this work are:
        //   John W. Ratcliff <jratcliffscarab@gmail.com>
        //   Brandon Rowlett <browlett@nvidia.com>
        //   Ken Fast <kfast@gdeb.com>
        //   Eric van Beurden <evanbeurden@nvidia.com>
        //   Alexander Potylitsin <apotylitsin@nvidia.com>
        //   Hasindu Gamaarachchi <hasindu2008@gmail.com>
        //   Jim Huang <jserv@biilabs.io>
        //   Mark Cheng <marktwtn@biilabs.io>
        //   Malcolm James MacLeod <malcolm@gulden.com>
        //   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
        //   Sebastian Pop <spop@amazon.com>
        //   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
        //   Danila Kutenin <danilak@google.com>
        //   François Turban (JishinMaster) <francois.turban@gmail.com>
        //   Pei-Hsuan Hung <afcidk@gmail.com>
        //   Yang-Hao Yuan <yanghau@biilabs.io>
        //   Syoyo Fujita <syoyo@lighttransport.com>
        //   Brecht Van Lommel <brecht@blender.org>

        /*
         * sse2neon is freely redistributable under the MIT License.
         *
         * Permission is hereby granted, free of charge, to any person obtaining
         * a copy of this software and associated documentation files (the
         * "Software"), to deal in the Software without restriction, including
         * without limitation the rights to use, copy, modify, merge, publish,
         * distribute, sublicense, and/or sell copies of the Software, and to
         * permit persons to whom the Software is furnished to do so, subject to
         * the following conditions:
         *
         * The above copyright notice and this permission notice shall be
         * included in all copies or substantial portions of the Software.
         *
         * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
         * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
         * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
         * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
         * SOFTWARE.
         */
        const auto nearbyint_as_int = [](const float v) {
            const auto a = vdupq_n_f32(v);
            const auto signmask = vdupq_n_u32(0x80000000);
            const auto half =
                vbslq_f32(signmask, a, vdupq_n_f32(0.5f)); /* +/- 0.5 */
            const auto r_normal = vcvtq_s32_f32(
                vaddq_f32(a, half)); /* round to integer: [a + 0.5] */
            const auto r_trunc =
                vcvtq_s32_f32(a); /* truncate to integer: [a] */
            const auto plusone = vreinterpretq_s32_u32(
                vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)),
                            31)); /* 1 or 0 */
            const auto r_even =
                vbicq_s32(vaddq_s32(r_trunc, plusone),
                          vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
            const auto delta = vsubq_f32(
                a,
                vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
            const auto is_delta_half =
                vceqq_f32(delta, half); /* delta == +/- 0.5 */
            return vbslq_s32(is_delta_half, r_even, r_normal);
        };
        return vgetq_lane_s32(nearbyint_as_int(value), 0);
#else
        return std::lroundf(value);
#endif
    }
};
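// Illustrative sketch of the rounding behaviour above: the SSE path uses the
// current MXCSR rounding mode (round-to-nearest-even by default) and the NEON
// paths explicitly implement ties-to-even, so halfway cases round to the
// nearest even integer, e.g.
//
//   OptiRound<arch, int>::roundScalar(2.5f); // -> 2   ("arch" is a placeholder)
//   OptiRound<arch, int>::roundScalar(3.5f); // -> 4
//
// The generic std::lroundf() fallback instead rounds halfway cases away from
// zero (2.5f -> 3), so exact .5 inputs may differ between the branches.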
template<typename _impl>
struct OptiDiv {
    using float_v = xsimd::batch<float, _impl>;

    ALWAYS_INLINE static float divScalar(const float &dividend, const float &divisor)
    {
#ifdef __SSE__
        float result = NAN;

        __m128 x = _mm_set_ss(divisor);
        __m128 y = _mm_set_ss(dividend);
        x = _mm_rcp_ss(x);
        x = _mm_mul_ss(x, y);

        _mm_store_ss(&result, x);
        return result;
#elif defined __ARM_NEON
        auto x = vdupq_n_f32(divisor);
        auto y = vdupq_n_f32(dividend);
        x = vrecpeq_f32(x);
        x = vmulq_f32(x, y);

        return vgetq_lane_f32(x, 0);
#else
        return (1.f / divisor) * dividend;
#endif
    }

    ALWAYS_INLINE static float_v divVector(const float_v &dividend, const float_v &divisor)
    {
        return dividend * xsimd::reciprocal(divisor);
    }
};
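// Sketch of the intent: all the fast paths above replace a true division with
// a hardware reciprocal estimate followed by a multiply, roughly
//
//   dividend / divisor  ~  dividend * rcp(divisor)
//
// The estimate (_mm_rcp_ss / vrecpeq_f32 / xsimd::reciprocal) trades accuracy
// for speed; it carries only a limited number of mantissa bits compared to a
// real divide, while the plain-C fallback keeps full float precision.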
template<typename _impl>
struct KoStreamedMath {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    /**
     * Composes src into dst without using vector instructions
     */
    template<bool useMask, bool useFlow, class Compositor, int pixelSize>
    static void genericComposite_novector(const KoCompositeOp::ParameterInfo &params)
    {
        using namespace Arithmetic;

        const qint32 linearInc = pixelSize;
        qint32 srcLinearInc = params.srcRowStride ? pixelSize : 0;

        quint8 *dstRowStart = params.dstRowStart;
        const quint8 *maskRowStart = params.maskRowStart;
        const quint8 *srcRowStart = params.srcRowStart;
        typename Compositor::ParamsWrapper paramsWrapper(params);

        for (qint32 r = params.rows; r > 0; --r) {
            const quint8 *mask = maskRowStart;
            const quint8 *src = srcRowStart;
            quint8 *dst = dstRowStart;

            int blockRest = params.cols;

            for (int i = 0; i < blockRest; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(
                    src, dst, mask, params.opacity, paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;

                if (useMask) {
                    mask++;
                }
            }

            srcRowStart += params.srcRowStride;
            dstRowStart += params.dstRowStride;

            if (useMask) {
                maskRowStart += params.maskRowStride;
            }
        }
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite32_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 4>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite128_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 16>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite64_novector(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite_novector<useMask, useFlow, Compositor, 8>(params);
    }

    static inline quint8 round_float_to_u8(float x)
    {
        return OptiRound<_impl, quint8>::roundScalar(x);
    }

    static inline quint8 lerp_mixed_u8_float(quint8 a, quint8 b, float alpha)
    {
        return round_float_to_u8(float(b - a) * alpha + float(a));
    }

    /**
     * Get a vector containing the first float_v::size values of the mask.
     * Each source mask element is considered to be an 8-bit integer.
     */
    static inline float_v fetch_mask_8(const quint8 *data)
    {
        return xsimd::batch_cast<float>(xsimd::load_and_extend<int_v>(data));
    }

    /**
     * Get alpha values from float_v::size pixels, 32 bits each
     * (4 channels, 8 bits per channel). The alpha value is considered
     * to be stored in the most significant byte of the pixel.
     *
     * \p aligned controls whether \p data is fetched using an aligned
     * instruction or not.
     * 1) Fetching aligned data with an unaligned instruction
     *    degrades performance.
     * 2) Fetching unaligned data with an aligned instruction
     *    causes \#GP (General Protection Exception).
     */
    template<bool aligned>
    static inline float_v fetch_alpha_32(const void *data)
    {
        using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;
        const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});
        return xsimd::to_float(xsimd::bitwise_cast_compat<int>(data_i >> 24));
    }

    /**
     * Get color values from float_v::size pixels, 32 bits each
     * (4 channels, 8 bits per channel). The color data is considered
     * to be stored in the 3 least significant bytes of the pixel.
     *
     * \p aligned controls whether \p data is fetched using an aligned
     * instruction or not.
     * 1) Fetching aligned data with an unaligned instruction
     *    degrades performance.
     * 2) Fetching unaligned data with an aligned instruction
     *    causes \#GP (General Protection Exception).
     */
    template<bool aligned>
    static inline void fetch_colors_32(const void *data, float_v &c1, float_v &c2, float_v &c3)
    {
        using U = typename std::conditional<aligned, xsimd::aligned_mode, xsimd::unaligned_mode>::type;

        const auto data_i = uint_v::load(static_cast<const typename uint_v::value_type *>(data), U{});

        const uint_v mask(0xFF);

        c1 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i >> 16) & mask));
        c2 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i >> 8) & mask));
        c3 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((data_i) & mask));
    }
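    // Worked example for the two fetch helpers above (a sketch, assuming a
    // little-endian machine so that a 4-byte B,G,R,A pixel reads back as the
    // 32-bit word 0xAARRGGBB):
    //
    //   word  = 0x80FF4020
    //   alpha =  word >> 24         = 0x80 (128)
    //   c1    = (word >> 16) & 0xFF = 0xFF (255)
    //   c2    = (word >> 8)  & 0xFF = 0x40 (64)
    //   c3    =  word        & 0xFF = 0x20 (32)
    //
    // Each extracted lane is converted to float, so the returned vectors hold
    // values in the 0..255 range.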
    /**
     * Pack color and alpha values into float_v::size pixels, 32 bits each
     * (4 channels, 8 bits per channel). The color data is considered
     * to be stored in the 3 least significant bytes of the pixel, the alpha
     * value in the most significant byte.
     *
     * NOTE: \p data must be an aligned pointer!
     */
    static inline void
    write_channels_32(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
    {
        const int_v mask(0xFF);

        const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
        const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
        const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
        const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
        xsimd::store_aligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
    }

    static inline void
    write_channels_32_unaligned(void *data, const float_v alpha, const float_v c1, const float_v c2, const float_v c3)
    {
        const int_v mask(0xFF);

        const auto v1 = (xsimd::nearbyint_as_int(alpha)) << 24;
        const auto v2 = (xsimd::nearbyint_as_int(c1) & mask) << 16;
        const auto v3 = (xsimd::nearbyint_as_int(c2) & mask) << 8;
        const auto v4 = (xsimd::nearbyint_as_int(c3) & mask);
        xsimd::store_unaligned(static_cast<typename int_v::value_type *>(data), (v1 | v2) | (v3 | v4));
    }
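    // Sketch of the packing performed by the two writers above: every float
    // lane is rounded back to an integer and the four channels are recombined
    // into one 32-bit word,
    //
    //   word = (round(alpha) << 24) | (round(c1) << 16) | (round(c2) << 8) | round(c3)
    //
    // i.e. the inverse of fetch_alpha_32()/fetch_colors_32(), so a fetch/write
    // round trip leaves a pixel unchanged up to rounding of the float values.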
    /**
     * Composes src pixels into dst pixels. Is optimized for 32-bit-per-pixel
     * colorspaces. Uses the \p Compositor strategy parameter for doing the
     * actual math of the composition.
     */
    template<bool useMask, bool useFlow, class Compositor, int pixelSize>
    static void genericComposite(const KoCompositeOp::ParameterInfo &params)
    {
        using namespace Arithmetic;

        const int vectorSize = static_cast<int>(float_v::size);
        const qint32 vectorInc = pixelSize * vectorSize;
        const qint32 linearInc = pixelSize;
        qint32 srcVectorInc = vectorInc;
        qint32 srcLinearInc = pixelSize;

        quint8 *dstRowStart = params.dstRowStart;
        const quint8 *maskRowStart = params.maskRowStart;
        const quint8 *srcRowStart = params.srcRowStart;
        typename Compositor::ParamsWrapper paramsWrapper(params);

        if (!params.srcRowStride) {
            if (pixelSize == 4) {
                auto *buf = reinterpret_cast<uint_v *>(xsimd::vector_aligned_malloc<typename uint_v::value_type>(vectorSize));
                *buf = uint_v(*(reinterpret_cast<const quint32 *>(srcRowStart)));
                srcRowStart = reinterpret_cast<quint8 *>(buf);
                srcLinearInc = 0;
                srcVectorInc = 0;
            } else {
                auto *buf = xsimd::vector_aligned_malloc<quint8>(vectorInc);
                quint8 *ptr = buf;

                for (int i = 0; i < vectorSize; i++) {
                    memcpy(ptr, params.srcRowStart, pixelSize);
                    ptr += pixelSize;
                }

                srcRowStart = buf;
                srcLinearInc = 0;
                srcVectorInc = 0;
            }
        }
#if BLOCKDEBUG
        int totalBlockAlign = 0;
        int totalBlockAlignedVector = 0;
        int totalBlockUnalignedVector = 0;
        int totalBlockRest = 0;
#endif

        for (qint32 r = params.rows; r > 0; --r) {
            // Hint: the mask is allowed to be unaligned
            const quint8 *mask = maskRowStart;

            const quint8 *src = srcRowStart;
            quint8 *dst = dstRowStart;

            const int pixelsAlignmentMask = vectorSize * sizeof(float) - 1;
            auto srcPtrValue = reinterpret_cast<uintptr_t>(src);
            auto dstPtrValue = reinterpret_cast<uintptr_t>(dst);
            uintptr_t srcAlignment = srcPtrValue & pixelsAlignmentMask;
            uintptr_t dstAlignment = dstPtrValue & pixelsAlignmentMask;

            // Uncomment if facing problems with alignment:
            // Q_ASSERT_X(!(dstAlignment & 3), "Compositing",
            //            "Pixel data must be aligned on pixel borders!");
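            // A sketch of the bookkeeping below: each row is split into up to
            // four blocks — a scalar "blockAlign" prefix that advances dst to
            // the next vector-aligned pixel, a vectorized middle part (the
            // aligned variant when src and dst share the same misalignment or
            // the src pixel is replicated, otherwise the unaligned one), and a
            // scalar "blockRest" tail for the leftover columns.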
            int blockAlign = params.cols;
            int blockAlignedVector = 0;
            int blockUnalignedVector = 0;
            int blockRest = 0;

            int *vectorBlock =
                srcAlignment == dstAlignment || !srcVectorInc ? &blockAlignedVector : &blockUnalignedVector;

            if (!dstAlignment) {
                blockAlign = 0;
                *vectorBlock = params.cols / vectorSize;
                blockRest = params.cols % vectorSize;
            } else if (params.cols > 2 * vectorSize) {
                blockAlign = (vectorInc - dstAlignment) / pixelSize;
                const int restCols = params.cols - blockAlign;
                if (restCols > 0) {
                    *vectorBlock = restCols / vectorSize;
                    blockRest = restCols % vectorSize;
                } else {
                    blockAlign = params.cols;
                    *vectorBlock = 0;
                    blockRest = 0;
                }
            }
#if BLOCKDEBUG
            totalBlockAlign += blockAlign;
            totalBlockAlignedVector += blockAlignedVector;
            totalBlockUnalignedVector += blockUnalignedVector;
            totalBlockRest += blockRest;
#endif

            for (int i = 0; i < blockAlign; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(
                    src, dst, mask, params.opacity, paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;

                if (useMask) {
                    mask++;
                }
            }

            for (int i = 0; i < blockAlignedVector; i++) {
                Compositor::template compositeVector<useMask, true, _impl>(
                    src, dst, mask, params.opacity, paramsWrapper);
                src += srcVectorInc;
                dst += vectorInc;

                if (useMask) {
                    mask += vectorSize;
                }
            }

            for (int i = 0; i < blockUnalignedVector; i++) {
                Compositor::template compositeVector<useMask, false, _impl>(
                    src, dst, mask, params.opacity, paramsWrapper);
                src += srcVectorInc;
                dst += vectorInc;

                if (useMask) {
                    mask += vectorSize;
                }
            }

            for (int i = 0; i < blockRest; i++) {
                Compositor::template compositeOnePixelScalar<useMask, _impl>(
                    src, dst, mask, params.opacity, paramsWrapper);
                src += srcLinearInc;
                dst += linearInc;

                if (useMask) {
                    mask++;
                }
            }

            srcRowStart += params.srcRowStride;
            dstRowStart += params.dstRowStride;

            if (useMask) {
                maskRowStart += params.maskRowStride;
            }
        }

#if BLOCKDEBUG
        dbgPigment << "I"
                   << "rows:" << params.rows << "\tpad(S):" << totalBlockAlign << "\tbav(V):" << totalBlockAlignedVector
                   << "\tbuv(V):" << totalBlockUnalignedVector << "\tres(S)"
                   << totalBlockRest; // << srcAlignment << dstAlignment;
#endif

        if (!params.srcRowStride) {
            xsimd::vector_aligned_free(srcRowStart);
        }
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite32(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 4>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite128(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 16>(params);
    }

    template<bool useMask, bool useFlow, class Compositor>
    static void genericComposite64(const KoCompositeOp::ParameterInfo &params)
    {
        genericComposite<useMask, useFlow, Compositor, 8>(params);
    }
};
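// A minimal sketch of the interface a Compositor strategy must provide,
// inferred from the calls made by genericComposite() above. The name
// "ExampleCompositor" is purely illustrative and the parameter types are
// shown loosely; see the existing compositor implementations for exact
// signatures:
//
//   struct ExampleCompositor {
//       struct ParamsWrapper {
//           explicit ParamsWrapper(const KoCompositeOp::ParameterInfo &params);
//       };
//
//       template<bool useMask, typename _impl>
//       static void compositeOnePixelScalar(const quint8 *src, quint8 *dst,
//                                           const quint8 *mask, float opacity,
//                                           const ParamsWrapper &oparams);
//
//       template<bool useMask, bool aligned, typename _impl>
//       static void compositeVector(const quint8 *src, quint8 *dst,
//                                   const quint8 *mask, float opacity,
//                                   const ParamsWrapper &oparams);
//   };
//
// The 32/64/128 suffixes of the wrappers refer to bits per pixel: 4, 8 and
// 16 bytes, i.e. four 8-bit, 16-bit or 32-bit-float channels respectively.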
template<typename channels_type, class _impl>
struct PixelStateRecoverHelper {
    using float_v = xsimd::batch<float, _impl>;
    using float_m = typename float_v::batch_bool_type;

    ALWAYS_INLINE
    PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
    {
        Q_UNUSED(c1);
        Q_UNUSED(c2);
        Q_UNUSED(c3);
    }

    ALWAYS_INLINE
    void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const {
        Q_UNUSED(mask);
        Q_UNUSED(c1);
        Q_UNUSED(c2);
        Q_UNUSED(c3);
    }
};

template<class _impl>
struct PixelStateRecoverHelper<float, _impl> {
    using float_v = xsimd::batch<float, _impl>;
    using float_m = typename float_v::batch_bool_type;

    ALWAYS_INLINE
    PixelStateRecoverHelper(const float_v &c1, const float_v &c2, const float_v &c3)
        : m_orig_c1(c1),
          m_orig_c2(c2),
          m_orig_c3(c3)
    {
    }

    ALWAYS_INLINE
    void recoverPixels(const float_m &mask, float_v &c1, float_v &c2, float_v &c3) const {
        if (xsimd::any(mask)) {
            c1 = xsimd::select(mask, m_orig_c1, c1);
            c2 = xsimd::select(mask, m_orig_c2, c2);
            c3 = xsimd::select(mask, m_orig_c3, c3);
        }
    }

private:
    const float_v m_orig_c1;
    const float_v m_orig_c2;
    const float_v m_orig_c3;
};

template<typename channels_type, class _impl>
struct PixelWrapper
{
};

template<class _impl>
struct PixelWrapper<quint16, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    ALWAYS_INLINE
    static quint16 lerpMixedUintFloat(quint16 a, quint16 b, float alpha)
    {
        return OptiRound<_impl, quint16>::roundScalar((float(b) - a) * alpha + float(a));
    }

    ALWAYS_INLINE
    static quint16 roundFloatToUint(float x)
    {
        return OptiRound<_impl, quint16>::roundScalar(x);
    }

    ALWAYS_INLINE
    static void normalizeAlpha(float &alpha)
    {
        const float uint16Rec1 = 1.0f / 65535.0f;
        alpha *= uint16Rec1;
    }

    ALWAYS_INLINE
    static void denormalizeAlpha(float &alpha)
    {
        const float uint16Max = 65535.0f;
        alpha *= uint16Max;
    }

    PixelWrapper()
        : mask(0xFFFF)
        , uint16Max(65535.0f)
        , uint16Rec1(1.0f / 65535.0f)
    {
    }

    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
        // Each pixel is two 32-bit words holding four 16-bit channels:
        // struct PackedPixel {
        //     quint32 rrgg;
        //     quint32 bbaa;
        // };
#if XSIMD_VERSION_MAJOR < 10
        uint_v pixelsC1C2;
        uint_v pixelsC3Alpha;
        KoRgbaInterleavers<16>::deinterleave(src, pixelsC1C2, pixelsC3Alpha);
#else
        const auto *srcPtr = static_cast<const typename uint_v::value_type *>(src);
        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2; // stride == 2
        const auto idx2 = idx1 + 1; // offset 1 == 2nd member

        const auto pixelsC1C2 = uint_v::gather(srcPtr, idx1);
        const auto pixelsC3Alpha = uint_v::gather(srcPtr, idx2);
#endif

        dst_c1 = xsimd::to_float(xsimd::bitwise_cast_compat<int>(pixelsC1C2 & mask)); // r
        dst_c2 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC1C2 >> 16) & mask)); // g
        dst_c3 = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC3Alpha & mask))); // b
        dst_alpha = xsimd::to_float(xsimd::bitwise_cast_compat<int>((pixelsC3Alpha >> 16) & mask)); // a

        dst_alpha *= uint16Rec1;
    }
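    // Worked example for read() (a sketch, assuming little-endian storage):
    // each 8-byte pixel is gathered as two 32-bit words, the first holding
    // channels c1 and c2, the second c3 and alpha. For c1=0x1111, c2=0x2222,
    // c3=0x3333, alpha=0xFFFF the two words are 0x22221111 and 0xFFFF3333;
    // after extraction the colour lanes keep their 0..65535 range while alpha
    // is normalized to 1.0f.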
    ALWAYS_INLINE void write(void *dst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
    {
        const auto alpha = a * uint16Max;

        const auto v1 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c1));
        const auto v2 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c2));
        const auto v3 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(c3));
        const auto v4 = xsimd::bitwise_cast_compat<unsigned int>(xsimd::nearbyint_as_int(alpha));

        const auto c1c2 = ((v2 & mask) << 16) | (v1 & mask);
        const auto c3ca = ((v4 & mask) << 16) | (v3 & mask);

#if XSIMD_VERSION_MAJOR < 10
        KoRgbaInterleavers<16>::interleave(dst, c1c2, c3ca);
#else
        auto dstPtr = reinterpret_cast<typename int_v::value_type *>(dst);

        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 2;
        const auto idx2 = idx1 + 1;

        c1c2.scatter(dstPtr, idx1);
        c3ca.scatter(dstPtr, idx2);
#endif
    }

    ALWAYS_INLINE
    void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(quint16) * 4);
    }

    ALWAYS_INLINE
    void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(quint16) * 4);
    }

    const uint_v mask;
    const float_v uint16Max;
    const float_v uint16Rec1;
};

template<typename _impl>
struct PixelWrapper<quint8, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    ALWAYS_INLINE
    static quint8 lerpMixedUintFloat(quint8 a, quint8 b, float alpha)
    {
        return KoStreamedMath<_impl>::lerp_mixed_u8_float(a, b, alpha);
    }

    ALWAYS_INLINE
    static quint8 roundFloatToUint(float x)
    {
        return KoStreamedMath<_impl>::round_float_to_u8(x);
    }

    ALWAYS_INLINE
    static void normalizeAlpha(float &alpha)
    {
        const float uint8Rec1 = 1.0f / 255.0f;
        alpha *= uint8Rec1;
    }

    ALWAYS_INLINE
    static void denormalizeAlpha(float &alpha)
    {
        const float uint8Max = 255.0f;
        alpha *= uint8Max;
    }

    PixelWrapper()
        : mask(quint32(0xFF))
        , uint8Max(255.0f)
        , uint8Rec1(1.0f / 255.0f)
    {
    }

    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
        dst_alpha = KoStreamedMath<_impl>::template fetch_alpha_32<false>(src);
        KoStreamedMath<_impl>::template fetch_colors_32<false>(src, dst_c1, dst_c2, dst_c3);

        dst_alpha *= uint8Rec1;
    }

    ALWAYS_INLINE
    void write(quint8 *dataDst, const float_v &c1, const float_v &c2, const float_v &c3, const float_v &a)
    {
        const auto alpha = a * uint8Max;

        KoStreamedMath<_impl>::write_channels_32_unaligned(dataDst, alpha, c1, c2, c3);
    }

    ALWAYS_INLINE
    void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(quint8) * 4);
    }

    ALWAYS_INLINE
    void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(quint8) * 4);
    }

    const uint_v mask;
    const float_v uint8Max;
    const float_v uint8Rec1;
};
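// Typical use of a PixelWrapper (a sketch, not a function from this header;
// src, dst and opacity are placeholders):
//
//   PixelWrapper<quint8, xsimd::default_arch> pw;
//   xsimd::batch<float, xsimd::default_arch> c1, c2, c3, alpha;
//   pw.read(src, c1, c2, c3, alpha);   // alpha comes out normalized to 0..1
//   alpha = alpha * opacity;           // example per-pixel operation
//   pw.write(dst, c1, c2, c3, alpha);  // alpha is denormalized on write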
template<typename _impl>
struct PixelWrapper<float, _impl> {
    using int_v = xsimd::batch<int, _impl>;
    using uint_v = xsimd::batch<unsigned int, _impl>;
    using float_v = xsimd::batch<float, _impl>;

    static_assert(int_v::size == uint_v::size, "the selected architecture does not guarantee vector size equality!");
    static_assert(uint_v::size == float_v::size, "the selected architecture does not guarantee vector size equality!");

    struct Pixel {
        float red;
        float green;
        float blue;
        float alpha;
    };

    ALWAYS_INLINE
    static float lerpMixedUintFloat(float a, float b, float alpha)
    {
        return Arithmetic::lerp(a, b, alpha);
    }

    ALWAYS_INLINE
    static float roundFloatToUint(float x)
    {
        return x;
    }

    ALWAYS_INLINE
    static void normalizeAlpha(float &alpha)
    {
        Q_UNUSED(alpha);
    }

    ALWAYS_INLINE
    static void denormalizeAlpha(float &alpha)
    {
        Q_UNUSED(alpha);
    }

    PixelWrapper() = default;

    ALWAYS_INLINE void read(const void *src, float_v &dst_c1, float_v &dst_c2, float_v &dst_c3, float_v &dst_alpha)
    {
#if XSIMD_VERSION_MAJOR < 10
        KoRgbaInterleavers<32>::deinterleave(src, dst_c1, dst_c2, dst_c3, dst_alpha);
#else
        const auto srcPtr = reinterpret_cast<const typename float_v::value_type *>(src);
        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4; // stride == 4
        const auto idx2 = idx1 + 1;
        const auto idx3 = idx1 + 2;
        const auto idx4 = idx1 + 3;

        dst_c1 = float_v::gather(srcPtr, idx1);
        dst_c2 = float_v::gather(srcPtr, idx2);
        dst_c3 = float_v::gather(srcPtr, idx3);
        dst_alpha = float_v::gather(srcPtr, idx4);
#endif
    }

    ALWAYS_INLINE void
    write(void *dst, const float_v &src_c1, const float_v &src_c2, const float_v &src_c3, const float_v &src_alpha)
    {
#if XSIMD_VERSION_MAJOR < 10
        KoRgbaInterleavers<32>::interleave(dst, src_c1, src_c2, src_c3, src_alpha);
#else
        auto dstPtr = reinterpret_cast<typename float_v::value_type *>(dst);

        const auto idx1 = xsimd::detail::make_sequence_as_batch<int_v>() * 4; // stride == 4
        const auto idx2 = idx1 + 1;
        const auto idx3 = idx1 + 2;
        const auto idx4 = idx1 + 3;

        src_c1.scatter(dstPtr, idx1);
        src_c2.scatter(dstPtr, idx2);
        src_c3.scatter(dstPtr, idx3);
        src_alpha.scatter(dstPtr, idx4);
#endif
    }

    ALWAYS_INLINE
    void clearPixels(quint8 *dataDst)
    {
        memset(dataDst, 0, float_v::size * sizeof(float) * 4);
    }

    ALWAYS_INLINE
    void copyPixels(const quint8 *dataSrc, quint8 *dataDst)
    {
        memcpy(dataDst, dataSrc, float_v::size * sizeof(float) * 4);
    }
};

namespace KoStreamedMathFunctions
{
template<int pixelSize>
ALWAYS_INLINE void clearPixel(quint8 *dst)
{
    std::memset(dst, 0, pixelSize);
}

template<int pixelSize>
ALWAYS_INLINE void copyPixel(const quint8 *src, quint8 *dst)
{
    std::memcpy(dst, src, pixelSize);
}
} // namespace KoStreamedMathFunctions

#endif /* __KOSTREAMED_MATH_H */