File indexing completed on 2024-05-12 15:59:37
0001 /* 0002 * SPDX-FileCopyrightText: 2021 Dmitry Kazakov <dimula73@gmail.com> 0003 * 0004 * SPDX-License-Identifier: GPL-2.0-or-later 0005 */ 0006 0007 #ifndef KoOptimizedPixelDataScalerU8ToU16_H 0008 #define KoOptimizedPixelDataScalerU8ToU16_H 0009 0010 #include "KoOptimizedPixelDataScalerU8ToU16Base.h" 0011 0012 #include "KoMultiArchBuildSupport.h" 0013 #include "kis_debug.h" 0014 0015 #include <xsimd_extensions/xsimd.hpp> 0016 0017 template<typename _impl = xsimd::current_arch> 0018 class KoOptimizedPixelDataScalerU8ToU16 : public KoOptimizedPixelDataScalerU8ToU16Base 0019 { 0020 public: 0021 KoOptimizedPixelDataScalerU8ToU16(int channelsPerPixel) 0022 : KoOptimizedPixelDataScalerU8ToU16Base(channelsPerPixel) 0023 { 0024 } 0025 0026 void convertU8ToU16(const quint8 *src, int srcRowStride, quint8 *dst, int dstRowStride, int numRows, int numColumns) const override 0027 { 0028 const int numColorChannels = m_channelsPerPixel * numColumns; 0029 0030 #if defined(HAVE_XSIMD) && XSIMD_WITH_AVX2 0031 using uint16_avx_v = xsimd::batch<uint16_t, xsimd::default_arch>; 0032 using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>; 0033 using uint8_v = xsimd::batch<uint8_t, xsimd::sse4_1>; 0034 0035 const int channelsPerAvx2Block = 16; 0036 const int channelsPerSse2Block = 8; 0037 const int avx2Block = numColorChannels / channelsPerAvx2Block; 0038 const int rest = numColorChannels % channelsPerAvx2Block; 0039 const int sse2Block = rest / channelsPerSse2Block; 0040 const int scalarBlock = rest % channelsPerSse2Block; 0041 #elif defined(HAVE_XSIMD) && (XSIMD_WITH_SSE4_1 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64) 0042 #if XSIMD_WITH_SSE4_1 0043 using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>; 0044 using uint8_v = xsimd::batch<uint8_t, xsimd::sse4_1>; 0045 #elif XSIMD_WITH_NEON64 0046 using uint16_v = xsimd::batch<uint16_t, xsimd::neon64>; 0047 using uint8_v = xsimd::batch<uint8_t, xsimd::neon64>; 0048 #else 0049 using uint16_v = xsimd::batch<uint16_t, xsimd::neon>; 0050 using uint8_v = xsimd::batch<uint8_t, xsimd::neon>; 0051 #endif 0052 0053 const int channelsPerSse2Block = 8; 0054 const int avx2Block = 0; 0055 const int sse2Block = numColorChannels / channelsPerSse2Block; 0056 const int scalarBlock = numColorChannels % channelsPerSse2Block; 0057 #else 0058 const int avx2Block = 0; 0059 const int sse2Block = 0; 0060 const int scalarBlock = numColorChannels; 0061 #endif 0062 0063 // qWarning() << ppVar(avx2Block) << ppVar(sse2Block); 0064 0065 for (int row = 0; row < numRows; row++) { 0066 const quint8 *srcPtr = src; 0067 auto *dstPtr = reinterpret_cast<quint16 *>(dst); 0068 0069 #if defined(HAVE_XSIMD) && XSIMD_WITH_AVX2 0070 for (int i = 0; i < avx2Block; i++) { 0071 const auto x = uint8_v::load_unaligned(srcPtr); 0072 0073 uint16_avx_v y(_mm256_cvtepu8_epi16(x)); 0074 const auto y_shifted = y << 8; 0075 y |= y_shifted; 0076 0077 y.store_unaligned( 0078 reinterpret_cast<typename uint16_avx_v::value_type *>(dstPtr)); 0079 0080 srcPtr += channelsPerAvx2Block; 0081 dstPtr += channelsPerAvx2Block; 0082 } 0083 #else 0084 Q_UNUSED(avx2Block); 0085 #endif 0086 0087 #if defined(HAVE_XSIMD) && (XSIMD_WITH_SSE4_1 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64) 0088 for (int i = 0; i < sse2Block; i++) { 0089 #if XSIMD_WITH_SSE4_1 0090 const uint8_v x(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(srcPtr))); 0091 #else 0092 const uint8_v x(vreinterpretq_u8_u32(vcombine_u32( 0093 vld1_u32(reinterpret_cast<const uint32_t *>(srcPtr)), 0094 vcreate_u32(0)))); 0095 #endif 0096 #if XSIMD_WITH_SSE4_1 0097 uint16_v y(_mm_cvtepu8_epi16(x.data)); 0098 #else 0099 uint16_v y(vmovl_u8(vget_low_u8(x.data))); 0100 #endif 0101 const auto y_shifted = y << 8; 0102 y |= y_shifted; 0103 0104 y.store_unaligned(reinterpret_cast<typename uint16_v::value_type *>(dstPtr)); 0105 0106 srcPtr += channelsPerSse2Block; 0107 dstPtr += channelsPerSse2Block; 0108 } 0109 #else 0110 Q_UNUSED(sse2Block); 0111 #endif 0112 0113 for (int i = 0; i < scalarBlock; i++) { 0114 const quint16 value = *srcPtr; 0115 0116 *dstPtr = static_cast<quint16>(value | (value << 8)); 0117 0118 srcPtr++; 0119 dstPtr++; 0120 } 0121 0122 src += srcRowStride; 0123 dst += dstRowStride; 0124 } 0125 } 0126 0127 void convertU16ToU8(const quint8 *src, int srcRowStride, quint8 *dst, int dstRowStride, int numRows, int numColumns) const override 0128 { 0129 const int numColorChannels = m_channelsPerPixel * numColumns; 0130 0131 #if defined(HAVE_XSIMD) && XSIMD_WITH_AVX2 0132 using uint16_avx_v = xsimd::batch<uint16_t, xsimd::default_arch>; 0133 using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>; 0134 0135 const int channelsPerAvx2Block = 32; 0136 const int channelsPerSse2Block = 16; 0137 const int avx2Block = numColorChannels / channelsPerAvx2Block; 0138 const int rest = numColorChannels % channelsPerAvx2Block; 0139 const int sse2Block = rest / channelsPerSse2Block; 0140 const int scalarBlock = rest % channelsPerSse2Block; 0141 0142 const auto offset1 = uint16_avx_v(128); 0143 const auto offset2 = uint16_v(128); 0144 0145 #elif defined(HAVE_XSIMD) && XSIMD_WITH_SSE2 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64 0146 // SSE2, unlike the previous function, is a perfectly valid option 0147 // while under generic. 0148 #if XSIMD_WITH_SSE2 0149 using uint16_v = xsimd::batch<uint16_t, xsimd::sse2>; 0150 #elif XSIMD_WITH_NEON64 0151 using uint16_v = xsimd::batch<uint16_t, xsimd::neon64>; 0152 #else 0153 using uint16_v = xsimd::batch<uint16_t, xsimd::neon>; 0154 #endif 0155 0156 const int channelsPerSse2Block = 16; 0157 const int avx2Block = 0; 0158 const int sse2Block = numColorChannels / channelsPerSse2Block; 0159 const int scalarBlock = numColorChannels % channelsPerSse2Block; 0160 0161 const auto offset2 = uint16_v(128); 0162 #else 0163 const int avx2Block = 0; 0164 const int sse2Block = 0; 0165 const int scalarBlock = numColorChannels; 0166 #endif 0167 0168 // qWarning() << ppVar(avx2Block) << ppVar(sse2Block); 0169 0170 for (int row = 0; row < numRows; row++) { 0171 const quint16 *srcPtr = reinterpret_cast<const quint16 *>(src); 0172 quint8 *dstPtr = dst; 0173 0174 #if defined(HAVE_XSIMD) && XSIMD_WITH_AVX2 0175 for (int i = 0; i < avx2Block; i++) { 0176 auto x1 = uint16_avx_v::load_unaligned(srcPtr); 0177 auto x2 = uint16_avx_v::load_unaligned(srcPtr + uint16_avx_v::size); 0178 0179 const auto x1_shifted = x1 >> 8; 0180 const auto x2_shifted = x2 >> 8; 0181 0182 x1 -= x1_shifted; 0183 x1 += offset1; 0184 x1 >>= 8; 0185 0186 x2 -= x2_shifted; 0187 x2 += offset1; 0188 x2 >>= 8; 0189 0190 x1.data = _mm256_packus_epi16(x1, x2); 0191 0192 // Packing in AVX2 does a bit different thing, not 0193 // what you expect that after seeing a SSE2 version :) 0194 // Therefore we need to permute the result... 0195 x1.data = _mm256_permute4x64_epi64(x1, 0xd8); 0196 0197 x1.store_unaligned(reinterpret_cast<typename uint16_v::value_type *>(dstPtr)); 0198 0199 srcPtr += channelsPerAvx2Block; 0200 dstPtr += channelsPerAvx2Block; 0201 } 0202 #else 0203 Q_UNUSED(avx2Block); 0204 #endif 0205 0206 #if defined(HAVE_XSIMD) && (XSIMD_WITH_SSE2 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64) 0207 for (int i = 0; i < sse2Block; i++) { 0208 auto x1 = uint16_v::load_unaligned(srcPtr); 0209 auto x2 = uint16_v::load_unaligned(srcPtr + uint16_v::size); 0210 0211 const uint16_v x1_shifted = x1 >> 8; 0212 const uint16_v x2_shifted = x2 >> 8; 0213 0214 x1 -= x1_shifted; 0215 x1 += offset2; 0216 x1 >>= 8; 0217 0218 x2 -= x2_shifted; 0219 x2 += offset2; 0220 x2 >>= 8; 0221 #if XSIMD_WITH_SSE2 0222 x1.data = _mm_packus_epi16(x1, x2); 0223 #else 0224 x1.data = vreinterpretq_u16_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_u16(x1)), vqmovun_s16(vreinterpretq_s16_u16(x2)))); 0225 #endif 0226 x1.store_unaligned(reinterpret_cast<typename uint16_v::value_type *>(dstPtr)); 0227 srcPtr += channelsPerSse2Block; 0228 dstPtr += channelsPerSse2Block; 0229 } 0230 #else 0231 Q_UNUSED(sse2Block); 0232 #endif 0233 0234 for (int i = 0; i < scalarBlock; i++) { 0235 const quint16 value = *srcPtr; 0236 0237 *dstPtr = (value - (value >> 8) + 128) >> 8; 0238 0239 srcPtr++; 0240 dstPtr++; 0241 } 0242 0243 src += srcRowStride; 0244 dst += dstRowStride; 0245 } 0246 } 0247 }; 0248 0249 #endif // KoOptimizedPixelDataScalerU8ToU16_H