/*
 *  SPDX-FileCopyrightText: 2021 Dmitry Kazakov <dimula73@gmail.com>
 *
 *  SPDX-License-Identifier: GPL-2.0-or-later
 */

#ifndef KoOptimizedPixelDataScalerU8ToU16_H
#define KoOptimizedPixelDataScalerU8ToU16_H

#include "KoOptimizedPixelDataScalerU8ToU16Base.h"

#include "KoMultiArchBuildSupport.h"
#include "kis_debug.h"

#include <xsimd_extensions/xsimd.hpp>

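/**
 * Scales U8 pixel channel data up to U16 and back using SIMD where
 * available. Widening maps each channel to v * 257, so that 0x00
 * becomes 0x0000 and 0xFF becomes 0xFFFF; narrowing performs the
 * inverse rounding division by 257. The _impl template parameter
 * selects the SIMD architecture the class is compiled for.
 */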
template<typename _impl = xsimd::current_arch>
class KoOptimizedPixelDataScalerU8ToU16 : public KoOptimizedPixelDataScalerU8ToU16Base
{
public:
    KoOptimizedPixelDataScalerU8ToU16(int channelsPerPixel)
        : KoOptimizedPixelDataScalerU8ToU16Base(channelsPerPixel)
    {
    }

    void convertU8ToU16(const quint8 *src, int srcRowStride, quint8 *dst, int dstRowStride, int numRows, int numColumns) const override
    {
        const int numColorChannels = m_channelsPerPixel * numColumns;

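        // Split each row into AVX2-sized (16 channels), SSE/NEON-sized
        // (8 channels) and scalar chunks, depending on what the build
        // supports.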
#if defined(HAVE_XSIMD) && XSIMD_WITH_AVX2
        using uint16_avx_v = xsimd::batch<uint16_t, xsimd::default_arch>;
        using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>;
        using uint8_v = xsimd::batch<uint8_t, xsimd::sse4_1>;

        const int channelsPerAvx2Block = 16;
        const int channelsPerSse2Block = 8;
        const int avx2Block = numColorChannels / channelsPerAvx2Block;
        const int rest = numColorChannels % channelsPerAvx2Block;
        const int sse2Block = rest / channelsPerSse2Block;
        const int scalarBlock = rest % channelsPerSse2Block;
#elif defined(HAVE_XSIMD) && (XSIMD_WITH_SSE4_1 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64)
#if XSIMD_WITH_SSE4_1
        using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>;
        using uint8_v = xsimd::batch<uint8_t, xsimd::sse4_1>;
#elif XSIMD_WITH_NEON64
        using uint16_v = xsimd::batch<uint16_t, xsimd::neon64>;
        using uint8_v = xsimd::batch<uint8_t, xsimd::neon64>;
#else
        using uint16_v = xsimd::batch<uint16_t, xsimd::neon>;
        using uint8_v = xsimd::batch<uint8_t, xsimd::neon>;
#endif

        const int channelsPerSse2Block = 8;
        const int avx2Block = 0;
        const int sse2Block = numColorChannels / channelsPerSse2Block;
        const int scalarBlock = numColorChannels % channelsPerSse2Block;
#else
        const int avx2Block = 0;
        const int sse2Block = 0;
        const int scalarBlock = numColorChannels;
#endif

        // qWarning() << ppVar(avx2Block) << ppVar(sse2Block);

        for (int row = 0; row < numRows; row++) {
            const quint8 *srcPtr = src;
            auto *dstPtr = reinterpret_cast<quint16 *>(dst);

#if defined(HAVE_XSIMD) && XSIMD_WITH_AVX2
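            // Widen 16 bytes per iteration: zero-extend each byte to
            // 16 bits, then OR in the value shifted left by 8, so that
            // v becomes v * 257 (0x00 -> 0x0000, 0xFF -> 0xFFFF).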
            for (int i = 0; i < avx2Block; i++) {
                const auto x = uint8_v::load_unaligned(srcPtr);

                uint16_avx_v y(_mm256_cvtepu8_epi16(x));
                const auto y_shifted = y << 8;
                y |= y_shifted;

                y.store_unaligned(
                    reinterpret_cast<typename uint16_avx_v::value_type *>(dstPtr));

                srcPtr += channelsPerAvx2Block;
                dstPtr += channelsPerAvx2Block;
            }
#else
            Q_UNUSED(avx2Block);
#endif

#if defined(HAVE_XSIMD) && (XSIMD_WITH_SSE4_1 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64)
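            // Same trick on 8 bytes at a time: load half a vector,
            // zero-extend to 16 bits and replicate each value into
            // its own high byte.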
            for (int i = 0; i < sse2Block; i++) {
#if XSIMD_WITH_SSE4_1
                const uint8_v x(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(srcPtr)));
#else
                const uint8_v x(vreinterpretq_u8_u32(vcombine_u32(
                    vld1_u32(reinterpret_cast<const uint32_t *>(srcPtr)),
                    vcreate_u32(0))));
#endif
#if XSIMD_WITH_SSE4_1
                uint16_v y(_mm_cvtepu8_epi16(x.data));
#else
                uint16_v y(vmovl_u8(vget_low_u8(x.data)));
#endif
                const auto y_shifted = y << 8;
                y |= y_shifted;

                y.store_unaligned(reinterpret_cast<typename uint16_v::value_type *>(dstPtr));

                srcPtr += channelsPerSse2Block;
                dstPtr += channelsPerSse2Block;
            }
#else
            Q_UNUSED(sse2Block);
#endif

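            // Scalar tail: value | (value << 8) is the same
            // multiplication by 257 on a single channel.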
            for (int i = 0; i < scalarBlock; i++) {
                const quint16 value = *srcPtr;

                *dstPtr = static_cast<quint16>(value | (value << 8));

                srcPtr++;
                dstPtr++;
            }

            src += srcRowStride;
            dst += dstRowStride;
        }
    }

    void convertU16ToU8(const quint8 *src, int srcRowStride, quint8 *dst, int dstRowStride, int numRows, int numColumns) const override
    {
        const int numColorChannels = m_channelsPerPixel * numColumns;

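        // Same chunking as in convertU8ToU16(), but narrowing consumes
        // twice as many channels per vector: 32 per AVX2 iteration and
        // 16 per SSE/NEON iteration.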
#if defined(HAVE_XSIMD) && XSIMD_WITH_AVX2
        using uint16_avx_v = xsimd::batch<uint16_t, xsimd::default_arch>;
        using uint16_v = xsimd::batch<uint16_t, xsimd::sse4_1>;

        const int channelsPerAvx2Block = 32;
        const int channelsPerSse2Block = 16;
        const int avx2Block = numColorChannels / channelsPerAvx2Block;
        const int rest = numColorChannels % channelsPerAvx2Block;
        const int sse2Block = rest / channelsPerSse2Block;
        const int scalarBlock = rest % channelsPerSse2Block;

        const auto offset1 = uint16_avx_v(128);
        const auto offset2 = uint16_v(128);

#elif defined(HAVE_XSIMD) && (XSIMD_WITH_SSE2 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64)
        // Unlike in convertU8ToU16() above, plain SSE2 is a perfectly
        // valid option here, even under the generic arch.
#if XSIMD_WITH_SSE2
        using uint16_v = xsimd::batch<uint16_t, xsimd::sse2>;
#elif XSIMD_WITH_NEON64
        using uint16_v = xsimd::batch<uint16_t, xsimd::neon64>;
#else
        using uint16_v = xsimd::batch<uint16_t, xsimd::neon>;
#endif

        const int channelsPerSse2Block = 16;
        const int avx2Block = 0;
        const int sse2Block = numColorChannels / channelsPerSse2Block;
        const int scalarBlock = numColorChannels % channelsPerSse2Block;

        const auto offset2 = uint16_v(128);
#else
        const int avx2Block = 0;
        const int sse2Block = 0;
        const int scalarBlock = numColorChannels;
#endif

        // qWarning() << ppVar(avx2Block) << ppVar(sse2Block);

        for (int row = 0; row < numRows; row++) {
            const quint16 *srcPtr = reinterpret_cast<const quint16 *>(src);
            quint8 *dstPtr = dst;

#if defined(HAVE_XSIMD) && XSIMD_WITH_AVX2
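            // Narrow two vectors of 16-bit channels per iteration:
            // (x - (x >> 8) + 128) >> 8 is the rounding division by
            // 257, the exact inverse of the widening above.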
            for (int i = 0; i < avx2Block; i++) {
                auto x1 = uint16_avx_v::load_unaligned(srcPtr);
                auto x2 = uint16_avx_v::load_unaligned(srcPtr + uint16_avx_v::size);

                const auto x1_shifted = x1 >> 8;
                const auto x2_shifted = x2 >> 8;

                x1 -= x1_shifted;
                x1 += offset1;
                x1 >>= 8;

                x2 -= x2_shifted;
                x2 += offset1;
                x2 >>= 8;

                x1.data = _mm256_packus_epi16(x1, x2);

                // AVX2 packs within each 128-bit lane rather than
                // across the whole register, which is not what you
                // might expect after seeing the SSE2 version :)
                // Therefore we need to permute the result back into
                // linear order.
                x1.data = _mm256_permute4x64_epi64(x1, 0xd8);

                x1.store_unaligned(reinterpret_cast<typename uint16_avx_v::value_type *>(dstPtr));

                srcPtr += channelsPerAvx2Block;
                dstPtr += channelsPerAvx2Block;
            }
#else
            Q_UNUSED(avx2Block);
#endif

#if defined(HAVE_XSIMD) && (XSIMD_WITH_SSE2 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64)
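            // Same rounding division by 257 on two vectors of 16-bit
            // channels, followed by a saturating pack down to bytes.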
            for (int i = 0; i < sse2Block; i++) {
                auto x1 = uint16_v::load_unaligned(srcPtr);
                auto x2 = uint16_v::load_unaligned(srcPtr + uint16_v::size);

                const uint16_v x1_shifted = x1 >> 8;
                const uint16_v x2_shifted = x2 >> 8;

                x1 -= x1_shifted;
                x1 += offset2;
                x1 >>= 8;

                x2 -= x2_shifted;
                x2 += offset2;
                x2 >>= 8;
#if XSIMD_WITH_SSE2
                x1.data = _mm_packus_epi16(x1, x2);
#else
                // NEON equivalent of _mm_packus_epi16: narrow each
                // vector with signed-to-unsigned saturation, then
                // recombine the two halves.
                x1.data = vreinterpretq_u16_u8(vcombine_u8(
                    vqmovun_s16(vreinterpretq_s16_u16(x1)),
                    vqmovun_s16(vreinterpretq_s16_u16(x2))));
#endif
                x1.store_unaligned(reinterpret_cast<typename uint16_v::value_type *>(dstPtr));
                srcPtr += channelsPerSse2Block;
                dstPtr += channelsPerSse2Block;
            }
#else
            Q_UNUSED(sse2Block);
#endif

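            // Scalar tail: same rounding division by 257, one channel
            // at a time.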
            for (int i = 0; i < scalarBlock; i++) {
                const quint16 value = *srcPtr;

                *dstPtr = (value - (value >> 8) + 128) >> 8;

                srcPtr++;
                dstPtr++;
            }

            src += srcRowStride;
            dst += dstRowStride;
        }
    }
};

#endif // KoOptimizedPixelDataScalerU8ToU16_H