File indexing completed on 2024-11-10 04:00:28

0001 /*
0002  *  SPDX-FileCopyrightText: 2012 Dmitry Kazakov <dimula73@gmail.com>
0003  *  SPDX-FileCopyrightText: 2015 Thorsten Zachmann <zachmann@kde.org>
0004  *  SPDX-FileCopyrightText: 2020 Mathias Wein <lynx.mw+kde@gmail.com>
0005  *  SPDX-FileCopyrightText: 2022 L. E. Segovia <amy@amyspark.me>
0006  *
0007  *  SPDX-License-Identifier: GPL-2.0-or-later
0008  */
0009 
0010 // for calculation of the needed alignment
0011 #include <xsimd_extensions/xsimd.hpp>
0012 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0013 #include <KoOptimizedCompositeOpOver32.h>
0014 #include <KoOptimizedCompositeOpOver128.h>
0015 #include <KoOptimizedCompositeOpCopy128.h>
0016 #include <KoOptimizedCompositeOpAlphaDarken32.h>
0017 #endif
0018 
0019 #include "kis_composition_benchmark.h"
0020 #include <simpletest.h>
0021 #include <QElapsedTimer>
0022 
0023 #include <KoColorSpace.h>
0024 #include <KoCompositeOp.h>
0025 #include <KoColorSpaceRegistry.h>
0026 
0027 #include <KoColorSpaceTraits.h>
0028 #include <KoCompositeOpAlphaDarken.h>
0029 #include <KoCompositeOpOver.h>
0030 #include <KoCompositeOpCopy2.h>
0031 #include <KoOptimizedCompositeOpFactory.h>
0032 #include <KoAlphaDarkenParamsWrapper.h>
0033 
0034 // for posix_memalign()
0035 #include <stdlib.h>
0036 
0037 #include <kis_debug.h>
0038 
// Portable aligned-allocation helpers.
//   MEMALIGN_ALLOC(p, a, s) stores a block of s bytes aligned to a bytes in *p
//   and evaluates to 0 on success, or to an error code otherwise.
//   MEMALIGN_FREE(p) releases a block obtained through MEMALIGN_ALLOC.
#if defined Q_OS_WIN
// _aligned_malloc() returns the pointer itself, so a null result is mapped to errno.
#define MEMALIGN_ALLOC(p, a, s) ((*(p)) = _aligned_malloc((s), (a)), *(p) ? 0 : errno)
#define MEMALIGN_FREE(p) _aligned_free((p))
#else
#define MEMALIGN_ALLOC(p, a, s) posix_memalign((p), (a), (s))
#define MEMALIGN_FREE(p) free((p))
#endif
0046 
#if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
// Float batch type for the architecture xsimd reports as current; its ::size is
// used below to derive buffer alignment and the vector block length.
using float_v = xsimd::batch<float, xsimd::current_arch>;
#endif
0050 
// Selects how the alpha channel of generated test pixels is filled
// (see generateAlphaValue()).
enum AlphaRange {
    ALPHA_ZERO   = 0, // alpha is always 0
    ALPHA_UNIT   = 1, // alpha is always the generator's unit() value
    ALPHA_RANDOM = 2  // alpha is drawn from the random generator
};
0056 
0057 
0058 template <typename channel_type, class RandomGenerator>
0059 inline channel_type generateAlphaValue(AlphaRange range, RandomGenerator &rnd) {
0060     channel_type value = 0;
0061 
0062     switch (range) {
0063     case ALPHA_ZERO:
0064         break;
0065     case ALPHA_UNIT:
0066         value = rnd.unit();
0067         break;
0068     case ALPHA_RANDOM:
0069         value = rnd();
0070         break;
0071     }
0072 
0073     return value;
0074 }
0075 
0076 #include <boost/random/mersenne_twister.hpp>
0077 #include <boost/random/uniform_smallint.hpp>
0078 #include <boost/random/uniform_real.hpp>
0079 
0080 template <typename channel_type>
0081 struct RandomGenerator {
0082     channel_type operator() () {
0083         qFatal("Wrong template instantiation");
0084         return channel_type(0);
0085     }
0086 
0087     channel_type unit() {
0088         qFatal("Wrong template instantiation");
0089         return channel_type(0);
0090     }
0091 };
0092 
0093 template <>
0094 struct RandomGenerator<quint8>
0095 {
0096     RandomGenerator(int seed)
0097         : m_smallint(0,255),
0098           m_rnd(seed)
0099     {
0100     }
0101 
0102     quint8 operator() () {
0103         return m_smallint(m_rnd);
0104     }
0105 
0106     quint8 unit() {
0107         return KoColorSpaceMathsTraits<quint8>::unitValue;
0108     }
0109 
0110     boost::uniform_smallint<int> m_smallint;
0111     boost::mt11213b m_rnd;
0112 };
0113 
0114 template <>
0115 struct RandomGenerator<quint16>
0116 {
0117     RandomGenerator(int seed)
0118         : m_smallint(0,65535),
0119           m_rnd(seed)
0120     {
0121     }
0122 
0123     quint16 operator() () {
0124         return m_smallint(m_rnd);
0125     }
0126 
0127     quint16 unit() {
0128         return KoColorSpaceMathsTraits<quint16>::unitValue;
0129     }
0130 
0131     boost::uniform_smallint<int> m_smallint;
0132     boost::mt11213b m_rnd;
0133 };
0134 
0135 template <>
0136 struct RandomGenerator<float>
0137 {
0138     RandomGenerator(int seed)
0139         : m_rnd(seed)
0140     {
0141     }
0142 
0143     float operator() () {
0144         //return float(m_rnd()) / float(m_rnd.max());
0145         return m_smallfloat(m_rnd);
0146     }
0147 
0148     float unit() {
0149         return KoColorSpaceMathsTraits<float>::unitValue;
0150     }
0151 
0152     boost::uniform_real<float> m_smallfloat;
0153     boost::mt11213b m_rnd;
0154 };
0155 
0156 template <>
0157 struct RandomGenerator<double> : RandomGenerator<float>
0158 {
0159     RandomGenerator(int seed)
0160         : RandomGenerator<float>(seed)
0161     {
0162     }
0163 };
0164 
0165 
0166 template <typename channel_type>
0167 void generateDataLine(uint seed, int numPixels, quint8 *srcPixels, quint8 *dstPixels, quint8 *mask, AlphaRange srcAlphaRange, AlphaRange dstAlphaRange)
0168 {
0169     Q_ASSERT(numPixels >= 4);
0170 
0171     RandomGenerator<channel_type> rnd(seed);
0172     RandomGenerator<quint8> maskRnd(seed + 1);
0173 
0174     channel_type *srcArray = reinterpret_cast<channel_type*>(srcPixels);
0175     channel_type *dstArray = reinterpret_cast<channel_type*>(dstPixels);
0176 
0177     for (int i = 0; i < numPixels; i++) {
0178         for (int j = 0; j < 3; j++) {
0179             channel_type s = rnd();
0180             channel_type d = rnd();
0181             *(srcArray++) = s;
0182             *(dstArray++) = d;
0183         }
0184 
0185         channel_type sa = generateAlphaValue<channel_type>(srcAlphaRange, rnd);
0186         channel_type da = generateAlphaValue<channel_type>(dstAlphaRange, rnd);
0187         *(srcArray++) = sa;
0188         *(dstArray++) = da;
0189 
0190         *(mask++) = maskRnd();
0191     }
0192 }
0193 
0194 void printData(int numPixels, quint8 *srcPixels, quint8 *dstPixels, quint8 *mask)
0195 {
0196     for (int i = 0; i < numPixels; i++) {
0197         qDebug() << "Src: "
0198                  << srcPixels[i*4] << "\t"
0199                  << srcPixels[i*4+1] << "\t"
0200                  << srcPixels[i*4+2] << "\t"
0201                  << srcPixels[i*4+3] << "\t"
0202                  << "Msk:" << mask[i];
0203 
0204         qDebug() << "Dst: "
0205                  << dstPixels[i*4] << "\t"
0206                  << dstPixels[i*4+1] << "\t"
0207                  << dstPixels[i*4+2] << "\t"
0208                  << dstPixels[i*4+3];
0209     }
0210 }
0211 
// Geometry shared by all tests and benchmarks in this file.
const int rowStride = 64;                    // pixels per tile row
const int totalRows = 64;                    // rows per tile
const QRect processRect(0,0,64,64);          // region handed to the composite ops
const int numPixels = rowStride * totalRows; // pixels per tile
const int numTiles = 1024;                   // tiles composited per benchmark run
0217 
0218 
// One set of test buffers. The pointers are produced by generateTiles() and
// must be released through freeTiles(); src and dst may be offset from the
// start of their allocation by an alignment shift.
struct Tile {
    quint8 *src;  // source pixels
    quint8 *dst;  // destination pixels
    quint8 *mask; // one mask byte per pixel
};
0224 #include <stdint.h>
0225 QVector<Tile> generateTiles(int size,
0226                             const int srcAlignmentShift,
0227                             const int dstAlignmentShift,
0228                             AlphaRange srcAlphaRange,
0229                             AlphaRange dstAlphaRange,
0230                             const quint32 pixelSize)
0231 {
0232     QVector<Tile> tiles(size);
0233 
0234 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0235     const int vecSize = float_v::size;
0236 #else
0237     const int vecSize = 1;
0238 #endif
0239 
0240     // the 256 are used to make sure that we have a good alignment no matter what build options are used.
0241     const size_t pixelAlignment = qMax(size_t(vecSize * sizeof(float)), size_t(256));
0242     const size_t maskAlignment = qMax(size_t(vecSize), size_t(256));
0243     for (int i = 0; i < size; i++) {
0244         void *ptr = 0;
0245         int error = MEMALIGN_ALLOC(&ptr, pixelAlignment, numPixels * pixelSize + srcAlignmentShift);
0246         if (error) {
0247             qFatal("posix_memalign failed: %d", error);
0248         }
0249         tiles[i].src = (quint8*)ptr + srcAlignmentShift;
0250         error = MEMALIGN_ALLOC(&ptr, pixelAlignment, numPixels * pixelSize + dstAlignmentShift);
0251         if (error) {
0252             qFatal("posix_memalign failed: %d", error);
0253         }
0254         tiles[i].dst = (quint8*)ptr + dstAlignmentShift;
0255         error = MEMALIGN_ALLOC(&ptr, maskAlignment, numPixels);
0256         if (error) {
0257             qFatal("posix_memalign failed: %d", error);
0258         }
0259         tiles[i].mask = (quint8*)ptr;
0260 
0261         if (pixelSize == 4) {
0262             generateDataLine<quint8>(1, numPixels, tiles[i].src, tiles[i].dst, tiles[i].mask, srcAlphaRange, dstAlphaRange);
0263         } else if (pixelSize == 8) {
0264             generateDataLine<quint16>(1, numPixels, tiles[i].src, tiles[i].dst, tiles[i].mask, srcAlphaRange, dstAlphaRange);
0265         } else if (pixelSize == 16) {
0266             generateDataLine<float>(1, numPixels, tiles[i].src, tiles[i].dst, tiles[i].mask, srcAlphaRange, dstAlphaRange);
0267         } else {
0268             qFatal("Pixel size %i is not implemented", pixelSize);
0269         }
0270     }
0271 
0272     return tiles;
0273 }
0274 
0275 void freeTiles(QVector<Tile> tiles,
0276                const int srcAlignmentShift,
0277                const int dstAlignmentShift)
0278 {
0279     Q_FOREACH (const Tile &tile, tiles) {
0280         MEMALIGN_FREE(tile.src - srcAlignmentShift);
0281         MEMALIGN_FREE(tile.dst - dstAlignmentShift);
0282         MEMALIGN_FREE(tile.mask);
0283     }
0284 }
0285 
/**
 * Returns true when @p a and @p b differ by at most @p prec.
 *
 * The distance is computed as "larger minus smaller" so that unsigned channel
 * types that are not promoted to int (e.g. 32-bit unsigned) cannot wrap around
 * when a < b.
 */
template <typename channel_type>
inline bool fuzzyCompare(channel_type a, channel_type b, channel_type prec) {
    return (a < b ? b - a : a - b) <= prec;
}
0290 
/**
 * Channel comparison policy that compares the two color values as they are;
 * the alpha arguments are accepted for interface compatibility and ignored.
 */
template<typename channel_type>
struct PixelEqualDirect
{
    bool operator() (channel_type c1, channel_type /*a1*/,
                     channel_type c2, channel_type /*a2*/,
                     channel_type prec) {

        return fuzzyCompare(c1, c2, prec);
    }
};
0304 
0305 template<typename channel_type>
0306 struct PixelEqualPremultiplied
0307 {
0308     bool operator() (channel_type c1, channel_type a1,
0309                      channel_type c2, channel_type a2,
0310                      channel_type prec) {
0311 
0312         c1 = KoColorSpaceMaths<channel_type>::multiply(c1, a1);
0313         c2 = KoColorSpaceMaths<channel_type>::multiply(c2, a2);
0314 
0315         return fuzzyCompare(c1, c2, prec);
0316     }
0317 };
0318 
0319 template <typename channel_type, template<typename> class Compare = PixelEqualDirect>
0320 inline bool comparePixels(channel_type *p1, channel_type *p2, channel_type prec) {
0321     Compare<channel_type> comp;
0322 
0323     return (p1[3] == p2[3] && p1[3] == 0) ||
0324         (comp(p1[0], p1[3], p2[0], p2[3], prec) &&
0325          comp(p1[1], p1[3], p2[1], p2[3], prec) &&
0326          comp(p1[2], p1[3], p2[2], p2[3], prec) &&
0327          fuzzyCompare(p1[3], p2[3], prec));
0328 }
0329 
0330 template <typename channel_type, template<typename> class Compare>
0331 bool compareTwoOpsPixels(QVector<Tile> &tiles, channel_type prec) {
0332     channel_type *dst1 = reinterpret_cast<channel_type*>(tiles[0].dst);
0333     channel_type *dst2 = reinterpret_cast<channel_type*>(tiles[1].dst);
0334 
0335     channel_type *src1 = reinterpret_cast<channel_type*>(tiles[0].src);
0336     channel_type *src2 = reinterpret_cast<channel_type*>(tiles[1].src);
0337 
0338     for (int i = 0; i < numPixels; i++) {
0339         if (!comparePixels<channel_type, Compare>(dst1, dst2, prec)) {
0340             qDebug() << "Wrong result:" << i;
0341             qDebug() << "Act: " << dst1[0] << dst1[1] << dst1[2] << dst1[3];
0342             qDebug() << "Exp: " << dst2[0] << dst2[1] << dst2[2] << dst2[3];
0343             qDebug() << "Dif: " << dst1[0] - dst2[0] << dst1[1] - dst2[1] << dst1[2] - dst2[2] << dst1[3] - dst2[3];
0344 
0345             channel_type *s1 = src1 + 4 * i;
0346             channel_type *s2 = src2 + 4 * i;
0347 
0348             qDebug() << "SrcA:" << s1[0] << s1[1] << s1[2] << s1[3];
0349             qDebug() << "SrcE:" << s2[0] << s2[1] << s2[2] << s2[3];
0350 
0351             qDebug() << "MskA:" << tiles[0].mask[i];
0352             qDebug() << "MskE:" << tiles[1].mask[i];
0353 
0354             return false;
0355         }
0356         dst1 += 4;
0357         dst2 += 4;
0358     }
0359     return true;
0360 }
0361 
0362 template<template<typename> class Compare = PixelEqualDirect>
0363 bool compareTwoOps(bool haveMask, const KoCompositeOp *op1, const KoCompositeOp *op2)
0364 {
0365     Q_ASSERT(op1->colorSpace()->pixelSize() == op2->colorSpace()->pixelSize());
0366     const quint32 pixelSize = op1->colorSpace()->pixelSize();
0367     const int alignment = 16;
0368     QVector<Tile> tiles = generateTiles(2, alignment, alignment, ALPHA_RANDOM, ALPHA_RANDOM, op1->colorSpace()->pixelSize());
0369 
0370     KoCompositeOp::ParameterInfo params;
0371     params.dstRowStride  = 4 * rowStride;
0372     params.srcRowStride  = 4 * rowStride;
0373     params.maskRowStride = rowStride;
0374     params.rows          = processRect.height();
0375     params.cols          = processRect.width();
0376     // This is a hack as in the old version we get a rounding of opacity to this value
0377     params.opacity       = float(Arithmetic::scale<quint8>(0.5*1.0f))/255.0;
0378     params.flow          = 0.3*1.0f;
0379     params.channelFlags  = QBitArray();
0380 
0381     params.dstRowStart   = tiles[0].dst;
0382     params.srcRowStart   = tiles[0].src;
0383     params.maskRowStart  = haveMask ? tiles[0].mask : 0;
0384     op1->composite(params);
0385 
0386     params.dstRowStart   = tiles[1].dst;
0387     params.srcRowStart   = tiles[1].src;
0388     params.maskRowStart  = haveMask ? tiles[1].mask : 0;
0389     op2->composite(params);
0390 
0391     bool compareResult = true;
0392     if (pixelSize == 4) {
0393         compareResult = compareTwoOpsPixels<quint8, Compare>(tiles, 10);
0394     }
0395     else if (pixelSize == 8) {
0396         compareResult = compareTwoOpsPixels<quint16, Compare>(tiles, 90);
0397     }
0398     else if (pixelSize == 16) {
0399         compareResult = compareTwoOpsPixels<float, Compare>(tiles, 2e-6);
0400     }
0401     else {
0402         qFatal("Pixel size %i is not implemented", pixelSize);
0403     }
0404 
0405     freeTiles(tiles, alignment, alignment);
0406 
0407     return compareResult;
0408 }
0409 
0410 QString getTestName(bool haveMask,
0411                     const int srcAlignmentShift,
0412                     const int dstAlignmentShift,
0413                     AlphaRange srcAlphaRange,
0414                     AlphaRange dstAlphaRange)
0415 {
0416 
0417     QString testName;
0418     testName +=
0419         !srcAlignmentShift && !dstAlignmentShift ? "Aligned    " :
0420         !srcAlignmentShift &&  dstAlignmentShift ? "SrcUnalign " :
0421          srcAlignmentShift && !dstAlignmentShift ? "DstUnalign " :
0422          srcAlignmentShift &&  dstAlignmentShift ? "Unaligned  " : "###";
0423 
0424     testName += haveMask ? "Mask   " : "NoMask ";
0425 
0426     testName +=
0427         srcAlphaRange == ALPHA_RANDOM ? "SrcRand " :
0428         srcAlphaRange == ALPHA_ZERO   ? "SrcZero " :
0429         srcAlphaRange == ALPHA_UNIT   ? "SrcUnit " : "###";
0430 
0431     testName +=
0432         dstAlphaRange == ALPHA_RANDOM ? "DstRand" :
0433         dstAlphaRange == ALPHA_ZERO   ? "DstZero" :
0434         dstAlphaRange == ALPHA_UNIT   ? "DstUnit" : "###";
0435 
0436     return testName;
0437 }
0438 
0439 void benchmarkCompositeOp(const KoCompositeOp *op,
0440                           bool haveMask,
0441                           qreal opacity,
0442                           qreal flow,
0443                           const int srcAlignmentShift,
0444                           const int dstAlignmentShift,
0445                           AlphaRange srcAlphaRange,
0446                           AlphaRange dstAlphaRange)
0447 {
0448     QString testName = getTestName(haveMask, srcAlignmentShift, dstAlignmentShift, srcAlphaRange, dstAlphaRange);
0449 
0450     QVector<Tile> tiles =
0451         generateTiles(numTiles, srcAlignmentShift, dstAlignmentShift, srcAlphaRange, dstAlphaRange, op->colorSpace()->pixelSize());
0452 
0453     const int tileOffset = 4 * (processRect.y() * rowStride + processRect.x());
0454 
0455     KoCompositeOp::ParameterInfo params;
0456     params.dstRowStride  = 4 * rowStride;
0457     params.srcRowStride  = 4 * rowStride;
0458     params.maskRowStride = rowStride;
0459     params.rows          = processRect.height();
0460     params.cols          = processRect.width();
0461     params.opacity       = opacity;
0462     params.flow          = flow;
0463     params.channelFlags  = QBitArray();
0464 
0465     QElapsedTimer timer;
0466     timer.start();
0467 
0468     Q_FOREACH (const Tile &tile, tiles) {
0469         params.dstRowStart   = tile.dst + tileOffset;
0470         params.srcRowStart   = tile.src + tileOffset;
0471         params.maskRowStart  = haveMask ? tile.mask : 0;
0472         op->composite(params);
0473     }
0474 
0475     qDebug() << testName << "RESULT:" << timer.elapsed() << "msec";
0476 
0477     freeTiles(tiles, srcAlignmentShift, dstAlignmentShift);
0478 }
0479 
0480 void benchmarkCompositeOp(const KoCompositeOp *op, const QString &postfix)
0481 {
0482     qDebug() << "Testing Composite Op:" << op->id() << "(" << postfix << ")";
0483 
0484     benchmarkCompositeOp(op, true, 0.5, 0.3, 0, 0, ALPHA_RANDOM, ALPHA_RANDOM);
0485     benchmarkCompositeOp(op, true, 0.5, 0.3, 8, 0, ALPHA_RANDOM, ALPHA_RANDOM);
0486     benchmarkCompositeOp(op, true, 0.5, 0.3, 0, 8, ALPHA_RANDOM, ALPHA_RANDOM);
0487     benchmarkCompositeOp(op, true, 0.5, 0.3, 4, 8, ALPHA_RANDOM, ALPHA_RANDOM);
0488 
0489 /// --- Vary the content of the source and destination
0490 
0491     benchmarkCompositeOp(op, false, 1.0, 1.0, 0, 0, ALPHA_RANDOM, ALPHA_RANDOM);
0492     benchmarkCompositeOp(op, false, 1.0, 1.0, 0, 0, ALPHA_ZERO, ALPHA_RANDOM);
0493     benchmarkCompositeOp(op, false, 1.0, 1.0, 0, 0, ALPHA_UNIT, ALPHA_RANDOM);
0494 
0495 /// ---
0496 
0497     benchmarkCompositeOp(op, false, 1.0, 1.0, 0, 0, ALPHA_RANDOM, ALPHA_ZERO);
0498     benchmarkCompositeOp(op, false, 1.0, 1.0, 0, 0, ALPHA_ZERO, ALPHA_ZERO);
0499     benchmarkCompositeOp(op, false, 1.0, 1.0, 0, 0, ALPHA_UNIT, ALPHA_ZERO);
0500 
0501 /// ---
0502 
0503     benchmarkCompositeOp(op, false, 1.0, 1.0, 0, 0, ALPHA_RANDOM, ALPHA_UNIT);
0504     benchmarkCompositeOp(op, false, 1.0, 1.0, 0, 0, ALPHA_ZERO, ALPHA_UNIT);
0505     benchmarkCompositeOp(op, false, 1.0, 1.0, 0, 0, ALPHA_UNIT, ALPHA_UNIT);
0506 }
0507 
0508 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0509 
0510 template <typename channels_type>
0511 void printError(quint8 *s, quint8 *d1, quint8 *d2, quint8 *msk1, int pos)
0512 {
0513     const channels_type *src1 = reinterpret_cast<const channels_type*>(s);
0514     const channels_type *dst1 = reinterpret_cast<const channels_type*>(d1);
0515     const channels_type *dst2 = reinterpret_cast<const channels_type*>(d2);
0516 
0517     qDebug() << "Wrong rounding in pixel:" << pos;
0518     qDebug() << "Vector version: " << dst1[0] << dst1[1] << dst1[2] << dst1[3];
0519     qDebug() << "Scalar version: " << dst2[0] << dst2[1] << dst2[2] << dst2[3];
0520     qDebug() << "Dif: " << dst1[0] - dst2[0] << dst1[1] - dst2[1] << dst1[2] - dst2[2] << dst1[3] - dst2[3];
0521 
0522     qDebug() << "src:" << src1[0] << src1[1] << src1[2] << src1[3];
0523     qDebug() << "msk:" << msk1[0];
0524 }
0525 
0526 template<class Compositor>
0527 void checkRounding(qreal opacity, qreal flow, qreal averageOpacity = -1, quint32 pixelSize = 4)
0528 {
0529     QVector<Tile> tiles =
0530         generateTiles(2, 0, 0, ALPHA_RANDOM, ALPHA_RANDOM, pixelSize);
0531 
0532     const int vecSize = float_v::size;
0533 
0534     const int numBlocks = numPixels / vecSize;
0535 
0536     quint8 *src1 = tiles[0].src;
0537     quint8 *dst1 = tiles[0].dst;
0538     quint8 *msk1 = tiles[0].mask;
0539 
0540     quint8 *src2 = tiles[1].src;
0541     quint8 *dst2 = tiles[1].dst;
0542     quint8 *msk2 = tiles[1].mask;
0543 
0544     KoCompositeOp::ParameterInfo params;
0545     params.opacity = opacity;
0546     params.flow = flow;
0547 
0548     if (averageOpacity >= 0.0) {
0549         params._lastOpacityData = averageOpacity;
0550         params.lastOpacity = &params._lastOpacityData;
0551     }
0552 
0553     params.channelFlags = QBitArray();
0554     typename Compositor::ParamsWrapper paramsWrapper(params);
0555 
0556     // The error count is needed as 38.5 gets rounded to 38 instead of 39 in the vc version.
0557     int errorcount = 0;
0558     for (int i = 0; i < numBlocks; i++) {
0559         Compositor::template compositeVector<true,true, xsimd::current_arch>(src1, dst1, msk1, params.opacity, paramsWrapper);
0560         for (int j = 0; j < vecSize; j++) {
0561 
0562             //if (8 * i + j == 7080) {
0563             //    qDebug() << "src: " << src2[0] << src2[1] << src2[2] << src2[3];
0564             //    qDebug() << "dst: " << dst2[0] << dst2[1] << dst2[2] << dst2[3];
0565             //    qDebug() << "msk:" << msk2[0];
0566             //}
0567 
0568             Compositor::template compositeOnePixelScalar<true, xsimd::current_arch>(src2, dst2, msk2, params.opacity, paramsWrapper);
0569 
0570             bool compareResult = true;
0571             if (pixelSize == 4) {
0572                 compareResult = comparePixels<quint8>(dst1, dst2, 0);
0573                 if (!compareResult) {
0574                     ++errorcount;
0575                     compareResult = comparePixels<quint8>(dst1, dst2, 1);
0576                     if (!compareResult) {
0577                         ++errorcount;
0578                     }
0579                 }
0580             }
0581             else if (pixelSize == 8) {
0582                 compareResult = comparePixels<quint16>(reinterpret_cast<quint16*>(dst1), reinterpret_cast<quint16*>(dst2), 0);
0583             }
0584             else if (pixelSize == 16) {
0585                 compareResult = comparePixels<float>(reinterpret_cast<float*>(dst1), reinterpret_cast<float*>(dst2), 0);
0586             }
0587             else {
0588                 qFatal("Pixel size %i is not implemented", pixelSize);
0589             }
0590 
0591             if(!compareResult || errorcount > 1) {
0592                 if (pixelSize == 4) {
0593                     printError<quint8>(src1, dst1, dst2, msk1, 8 * i + j);
0594                 } else if (pixelSize == 8) {
0595                     printError<quint16>(src1, dst1, dst2, msk1, 8 * i + j);
0596                 } else if (pixelSize == 16) {
0597                     printError<float>(src1, dst1, dst2, msk1, 8 * i + j);
0598                 } else {
0599                     qFatal("Pixel size %i is not implemented", pixelSize);
0600                 }
0601 
0602                 QFAIL("Wrong rounding");
0603             }
0604 
0605             src1 += pixelSize;
0606             dst1 += pixelSize;
0607             src2 += pixelSize;
0608             dst2 += pixelSize;
0609             msk1++;
0610             msk2++;
0611         }
0612     }
0613 
0614     freeTiles(tiles, 0, 0);
0615 }
0616 
0617 #endif
0618 
0619 
// Logs which SIMD architecture this translation unit was built for, which
// architectures xsimd lists as supported, and the best architecture reported
// for the running machine. Does nothing when xsimd support is compiled out.
void KisCompositionBenchmark::detectBuildArchitecture()
{
#if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
    using namespace xsimd;

    // Architecture selected for this build pass and xsimd's default one.
    qDebug() << "built for" << ppVar(current_arch().name());
    qDebug() << "built for" << ppVar(default_arch().name());

    // SSE family
    qDebug() << ppVar(supported_architectures().contains<sse2>());
    qDebug() << ppVar(supported_architectures().contains<sse3>());
    qDebug() << ppVar(supported_architectures().contains<ssse3>());
    qDebug() << ppVar(supported_architectures().contains<sse4_1>());
    qDebug() << ppVar(supported_architectures().contains<sse4_2>());
    qDebug() << ppVar(supported_architectures().contains<fma3<sse4_2>>());

    // AVX family
    qDebug() << ppVar(supported_architectures().contains<avx>());
    qDebug() << ppVar(supported_architectures().contains<avx2>());
    qDebug() << ppVar(supported_architectures().contains<fma3<avx2>>());
    qDebug() << ppVar(supported_architectures().contains<fma4>());
    qDebug() << ppVar(supported_architectures().contains<avx512f>());
    qDebug() << ppVar(supported_architectures().contains<avx512bw>());
    qDebug() << ppVar(supported_architectures().contains<avx512dq>());
    qDebug() << ppVar(supported_architectures().contains<avx512cd>());
    // Runtime detection result, printed in hexadecimal.
    qDebug().nospace() << "running on " << hex << "0x" << xsimd::available_architectures().best;
#endif
}
0646 
0647 void KisCompositionBenchmark::checkRoundingAlphaDarken_05_03()
0648 {
0649 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0650     checkRounding<AlphaDarkenCompositor32<quint8, quint32, KoAlphaDarkenParamsWrapperCreamy> >(0.5,0.3);
0651 #endif
0652 }
0653 
0654 void KisCompositionBenchmark::checkRoundingAlphaDarken_05_05()
0655 {
0656 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0657     checkRounding<AlphaDarkenCompositor32<quint8, quint32, KoAlphaDarkenParamsWrapperCreamy> >(0.5,0.5);
0658 #endif
0659 }
0660 
0661 void KisCompositionBenchmark::checkRoundingAlphaDarken_05_07()
0662 {
0663 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0664     checkRounding<AlphaDarkenCompositor32<quint8, quint32, KoAlphaDarkenParamsWrapperCreamy> >(0.5,0.7);
0665 #endif
0666 }
0667 
0668 void KisCompositionBenchmark::checkRoundingAlphaDarken_05_10()
0669 {
0670 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0671     checkRounding<AlphaDarkenCompositor32<quint8, quint32, KoAlphaDarkenParamsWrapperCreamy> >(0.5,1.0);
0672 #endif
0673 }
0674 
0675 void KisCompositionBenchmark::checkRoundingAlphaDarken_05_10_08()
0676 {
0677 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0678     checkRounding<AlphaDarkenCompositor32<quint8, quint32, KoAlphaDarkenParamsWrapperCreamy> >(0.5,1.0,0.8);
0679 #endif
0680 }
0681 
0682 void KisCompositionBenchmark::checkRoundingAlphaDarkenF32_05_03()
0683 {
0684 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0685     checkRounding<OverCompositor128<float, false, true> >(0.5, 0.3, -1, 16);
0686 #endif
0687 }
0688 
0689 void KisCompositionBenchmark::checkRoundingAlphaDarkenF32_05_05()
0690 {
0691 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0692     checkRounding<OverCompositor128<float, false, true> >(0.5, 0.5, -1, 16);
0693 #endif
0694 }
0695 
0696 void KisCompositionBenchmark::checkRoundingAlphaDarkenF32_05_07()
0697 {
0698 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0699     checkRounding<OverCompositor128<float, false, true> >(0.5, 0.7, -1, 16);
0700 #endif
0701 }
0702 
0703 void KisCompositionBenchmark::checkRoundingAlphaDarkenF32_05_10()
0704 {
0705 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0706     checkRounding<OverCompositor128<float, false, true> >(0.5, 1.0, -1, 16);
0707 #endif
0708 }
0709 
0710 void KisCompositionBenchmark::checkRoundingAlphaDarkenF32_05_10_08()
0711 {
0712 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0713     checkRounding<OverCompositor128<float, false, true> >(0.5, 1.0, 0.8, 16);
0714 #endif
0715 }
0716 
0717 void KisCompositionBenchmark::checkRoundingOver()
0718 {
0719 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0720     checkRounding<OverCompositor32<quint8, quint32, false, true> >(0.5, 0.3);
0721 #endif
0722 }
0723 
0724 void KisCompositionBenchmark::checkRoundingOverRgbaU16()
0725 {
0726 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0727     checkRounding<OverCompositor128<quint16, false, true> >(0.5, 1.0, -1, 8);
0728 #endif
0729 }
0730 
0731 void KisCompositionBenchmark::checkRoundingOverRgbaF32()
0732 {
0733 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0734     checkRounding<OverCompositor128<float, false, true> >(0.5, 1.0, -1, 16);
0735 #endif
0736 }
0737 #include <cfenv>
0738 void KisCompositionBenchmark::checkRoundingCopyRgbaU16()
0739 {
0740 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0741     checkRounding<CopyCompositor128<quint16, false, true> >(0.5, 1.0, -1, 8);
0742 #endif
0743 }
0744 
0745 void KisCompositionBenchmark::checkRoundingCopyRgbaF32()
0746 {
0747 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
0748     checkRounding<CopyCompositor128<float, false, true> >(0.5, 1.0, -1, 16);
0749 #endif
0750 }
0751 
0752 void KisCompositionBenchmark::compareAlphaDarkenOps()
0753 {
0754     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
0755     KoCompositeOp *opAct = KoOptimizedCompositeOpFactory::createAlphaDarkenOpCreamy32(cs);
0756     KoCompositeOp *opExp = new KoCompositeOpAlphaDarken<KoBgrU8Traits, KoAlphaDarkenParamsWrapperCreamy>(cs);
0757 
0758     QVERIFY(compareTwoOps(true, opAct, opExp));
0759 
0760     delete opExp;
0761     delete opAct;
0762 }
0763 
0764 void KisCompositionBenchmark::compareRgbF32AlphaDarkenOps()
0765 {
0766     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->colorSpace("RGBA", "F32", "");
0767     KoCompositeOp *opAct = KoOptimizedCompositeOpFactory::createAlphaDarkenOpCreamy128(cs);
0768     KoCompositeOp *opExp = new KoCompositeOpAlphaDarken<KoRgbF32Traits, KoAlphaDarkenParamsWrapperCreamy>(cs);
0769 
0770     QVERIFY(compareTwoOps(true, opAct, opExp));
0771 
0772     delete opExp;
0773     delete opAct;
0774 }
0775 
0776 void KisCompositionBenchmark::compareAlphaDarkenOpsNoMask()
0777 {
0778     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
0779     KoCompositeOp *opAct = KoOptimizedCompositeOpFactory::createAlphaDarkenOpCreamy32(cs);
0780     KoCompositeOp *opExp = new KoCompositeOpAlphaDarken<KoBgrU8Traits, KoAlphaDarkenParamsWrapperCreamy>(cs);
0781 
0782     QVERIFY(compareTwoOps(false, opAct, opExp));
0783 
0784     delete opExp;
0785     delete opAct;
0786 }
0787 
0788 void KisCompositionBenchmark::compareRgbU16AlphaDarkenOps()
0789 {
0790     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb16();
0791     KoCompositeOp *opAct = KoOptimizedCompositeOpFactory::createAlphaDarkenOpCreamyU64(cs);
0792     KoCompositeOp *opExp = new KoCompositeOpAlphaDarken<KoRgbU16Traits, KoAlphaDarkenParamsWrapperCreamy>(cs);
0793 
0794     QVERIFY(compareTwoOps(true, opAct, opExp));
0795 
0796     delete opExp;
0797     delete opAct;
0798 }
0799 
0800 void KisCompositionBenchmark::compareOverOps()
0801 {
0802     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
0803     KoCompositeOp *opAct = KoOptimizedCompositeOpFactory::createOverOp32(cs);
0804     KoCompositeOp *opExp = new KoCompositeOpOver<KoBgrU8Traits>(cs);
0805 
0806     QVERIFY(compareTwoOps(true, opAct, opExp));
0807 
0808     delete opExp;
0809     delete opAct;
0810 }
0811 
0812 void KisCompositionBenchmark::compareOverOpsNoMask()
0813 {
0814     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
0815     KoCompositeOp *opAct = KoOptimizedCompositeOpFactory::createOverOp32(cs);
0816     KoCompositeOp *opExp = new KoCompositeOpOver<KoBgrU8Traits>(cs);
0817 
0818     QVERIFY(compareTwoOps(false, opAct, opExp));
0819 
0820     delete opExp;
0821     delete opAct;
0822 }
0823 
0824 void KisCompositionBenchmark::compareRgbU16OverOps()
0825 {
0826     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb16();
0827     KoCompositeOp *opAct = KoOptimizedCompositeOpFactory::createOverOpU64(cs);
0828     KoCompositeOp *opExp = new KoCompositeOpOver<KoRgbU16Traits>(cs);
0829 
0830     QVERIFY(compareTwoOps(false, opAct, opExp));
0831 
0832     delete opExp;
0833     delete opAct;
0834 }
0835 
0836 void KisCompositionBenchmark::compareRgbF32OverOps()
0837 {
0838     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->colorSpace("RGBA", "F32", "");
0839     KoCompositeOp *opAct = KoOptimizedCompositeOpFactory::createOverOp128(cs);
0840     KoCompositeOp *opExp = new KoCompositeOpOver<KoRgbF32Traits>(cs);
0841 
0842     QVERIFY(compareTwoOps(false, opAct, opExp));
0843 
0844     delete opExp;
0845     delete opAct;
0846 }
0847 
0848 void KisCompositionBenchmark::compareRgbU8CopyOps()
0849 {
0850     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
0851     KoCompositeOp *opAct = KoOptimizedCompositeOpFactory::createCopyOp32(cs);
0852     KoCompositeOp *opExp = new KoCompositeOpCopy2<KoRgbU8Traits>(cs);
0853 
0854     // Since composite copy involves a channel division operation,
0855     // there might be significant rounding difference with purely
0856     // integer implementation. So we should compare in premultiplied
0857     // form.
0858     QVERIFY(compareTwoOps<PixelEqualPremultiplied>(false, opAct, opExp));
0859 
0860     delete opExp;
0861     delete opAct;
0862 }
0863 
0864 void KisCompositionBenchmark::compareRgbU16CopyOps()
0865 {
0866     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb16();
0867     KoCompositeOp *opAct = KoOptimizedCompositeOpFactory::createCopyOpU64(cs);
0868     KoCompositeOp *opExp = new KoCompositeOpCopy2<KoRgbU16Traits>(cs);
0869 
0870     QVERIFY(compareTwoOps(false, opAct, opExp));
0871 
0872     delete opExp;
0873     delete opAct;
0874 }
0875 
0876 void KisCompositionBenchmark::compareRgbF32CopyOps()
0877 {
0878     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->colorSpace("RGBA", "F32", "");
0879     KoCompositeOp *opAct = KoOptimizedCompositeOpFactory::createCopyOp128(cs);
0880     KoCompositeOp *opExp = new KoCompositeOpCopy2<KoRgbF32Traits>(cs);
0881 
0882     QVERIFY(compareTwoOps(false, opAct, opExp));
0883 
0884     delete opExp;
0885     delete opAct;
0886 }
0887 
0888 void KisCompositionBenchmark::testRgb8CompositeAlphaDarkenLegacy()
0889 {
0890     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
0891     KoCompositeOp *op = new KoCompositeOpAlphaDarken<KoBgrU8Traits, KoAlphaDarkenParamsWrapperCreamy>(cs);
0892     benchmarkCompositeOp(op, "Legacy");
0893     delete op;
0894 }
0895 
0896 void KisCompositionBenchmark::testRgb8CompositeAlphaDarkenOptimized()
0897 {
0898     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
0899     KoCompositeOp *op = KoOptimizedCompositeOpFactory::createAlphaDarkenOpCreamy32(cs);
0900     benchmarkCompositeOp(op, "Optimized");
0901     delete op;
0902 }
0903 
0904 void KisCompositionBenchmark::testRgb8CompositeOverLegacy()
0905 {
0906     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
0907     KoCompositeOp *op = new KoCompositeOpOver<KoBgrU8Traits>(cs);
0908     benchmarkCompositeOp(op, "Legacy");
0909     delete op;
0910 }
0911 
0912 void KisCompositionBenchmark::testRgb8CompositeOverOptimized()
0913 {
0914     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
0915     KoCompositeOp *op = KoOptimizedCompositeOpFactory::createOverOp32(cs);
0916     benchmarkCompositeOp(op, "Optimized");
0917     delete op;
0918 }
0919 
0920 void KisCompositionBenchmark::testRgb16CompositeAlphaDarkenLegacy()
0921 {
0922     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb16();
0923     KoCompositeOp *op = new KoCompositeOpAlphaDarken<KoBgrU16Traits, KoAlphaDarkenParamsWrapperCreamy>(cs);
0924     benchmarkCompositeOp(op, "Legacy");
0925     delete op;
0926 }
0927 
0928 void KisCompositionBenchmark::testRgb16CompositeAlphaDarkenOptimized()
0929 {
0930     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb16();
0931     KoCompositeOp *op = KoOptimizedCompositeOpFactory::createAlphaDarkenOpCreamyU64(cs);
0932     benchmarkCompositeOp(op, "Optimized");
0933     delete op;
0934 }
0935 
0936 void KisCompositionBenchmark::testRgb16CompositeOverLegacy()
0937 {
0938     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb16();
0939     KoCompositeOp *op = new KoCompositeOpOver<KoBgrU16Traits>(cs);
0940     benchmarkCompositeOp(op, "Legacy");
0941     delete op;
0942 }
0943 
0944 void KisCompositionBenchmark::testRgb16CompositeOverOptimized()
0945 {
0946     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb16();
0947     KoCompositeOp *op = KoOptimizedCompositeOpFactory::createOverOpU64(cs);
0948     benchmarkCompositeOp(op, "Optimized");
0949     delete op;
0950 }
0951 
0952 
0953 void KisCompositionBenchmark::testRgb16CompositeCopyLegacy()
0954 {
0955     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb16();
0956     KoCompositeOp *op = new KoCompositeOpCopy2<KoBgrU16Traits>(cs);
0957     benchmarkCompositeOp(op, "Legacy");
0958     delete op;
0959 }
0960 
0961 void KisCompositionBenchmark::testRgb16CompositeCopyOptimized()
0962 {
0963     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb16();
0964     KoCompositeOp *op = KoOptimizedCompositeOpFactory::createCopyOpU64(cs);
0965     benchmarkCompositeOp(op, "Optimized");
0966     delete op;
0967 }
0968 void KisCompositionBenchmark::testRgbF32CompositeAlphaDarkenLegacy()
0969 {
0970     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->colorSpace("RGBA", "F32", "");
0971     KoCompositeOp *op = new KoCompositeOpAlphaDarken<KoRgbF32Traits, KoAlphaDarkenParamsWrapperCreamy>(cs);
0972     benchmarkCompositeOp(op, "Legacy");
0973     delete op;
0974 }
0975 
0976 void KisCompositionBenchmark::testRgbF32CompositeAlphaDarkenOptimized()
0977 {
0978     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->colorSpace("RGBA", "F32", "");
0979     KoCompositeOp *op = KoOptimizedCompositeOpFactory::createAlphaDarkenOpCreamy128(cs);
0980     benchmarkCompositeOp(op, "Optimized");
0981     delete op;
0982 }
0983 
0984 void KisCompositionBenchmark::testRgbF32CompositeOverLegacy()
0985 {
0986     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->colorSpace("RGBA", "F32", "");
0987     KoCompositeOp *op = new KoCompositeOpOver<KoRgbF32Traits>(cs);
0988     benchmarkCompositeOp(op, "RGBF32 Legacy");
0989     delete op;
0990 }
0991 
0992 void KisCompositionBenchmark::testRgbF32CompositeOverOptimized()
0993 {
0994     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->colorSpace("RGBA", "F32", "");
0995     KoCompositeOp *op = KoOptimizedCompositeOpFactory::createOverOp128(cs);
0996     benchmarkCompositeOp(op, "RGBF32 Optimized");
0997     delete op;
0998 }
0999 
1000 void KisCompositionBenchmark::testRgbF32CompositeCopyLegacy()
1001 {
1002     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->colorSpace("RGBA", "F32", "");
1003     KoCompositeOp *op = new KoCompositeOpCopy2<KoRgbF32Traits>(cs);
1004     benchmarkCompositeOp(op, "RGBF32 Legacy");
1005     delete op;
1006 }
1007 
1008 void KisCompositionBenchmark::testRgbF32CompositeCopyOptimized()
1009 {
1010     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->colorSpace("RGBA", "F32", "");
1011     KoCompositeOp *op = KoOptimizedCompositeOpFactory::createCopyOp128(cs);
1012     benchmarkCompositeOp(op, "RGBF32 Optimized");
1013     delete op;
1014 }
1015 
1016 void KisCompositionBenchmark::testRgb8CompositeAlphaDarkenReal_Aligned()
1017 {
1018     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
1019     const KoCompositeOp *op = cs->compositeOp(COMPOSITE_ALPHA_DARKEN);
1020     benchmarkCompositeOp(op, true, 0.5, 0.3, 0, 0, ALPHA_RANDOM, ALPHA_RANDOM);
1021 }
1022 
1023 void KisCompositionBenchmark::testRgb8CompositeOverReal_Aligned()
1024 {
1025     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
1026     const KoCompositeOp *op = cs->compositeOp(COMPOSITE_OVER);
1027     benchmarkCompositeOp(op, true, 0.5, 0.3, 0, 0, ALPHA_RANDOM, ALPHA_RANDOM);
1028 }
1029 
1030 void KisCompositionBenchmark::testRgb8CompositeCopyLegacy()
1031 {
1032     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
1033     KoCompositeOp *op = new KoCompositeOpCopy2<KoBgrU8Traits>(cs);
1034     benchmarkCompositeOp(op, "Copy");
1035     delete op;
1036 }
1037 
1038 void KisCompositionBenchmark::testRgb8CompositeCopyOptimized()
1039 {
1040     const KoColorSpace *cs = KoColorSpaceRegistry::instance()->rgb8();
1041     KoCompositeOp *op = KoOptimizedCompositeOpFactory::createCopyOp32(cs);
1042     benchmarkCompositeOp(op, "Optimized");
1043     delete op;
1044 }
1045 
1046 void KisCompositionBenchmark::benchmarkMemcpy()
1047 {
1048     QVector<Tile> tiles =
1049         generateTiles(numTiles, 0, 0, ALPHA_UNIT, ALPHA_UNIT, 4);
1050 
1051     QBENCHMARK_ONCE {
1052         Q_FOREACH (const Tile &tile, tiles) {
1053             memcpy(tile.dst, tile.src, 4 * numPixels);
1054         }
1055     }
1056 
1057     freeTiles(tiles, 0, 0);
1058 }
1059 
#if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
// Number of elements in one float_v batch.
const int vecSize = float_v::size;

// Alignments handed to MEMALIGN_ALLOC() by the conversion benchmarks below:
// the byte size of vecSize elements of the given type, but never smaller
// than sizeof(void *).
const size_t uint8VecAlignment = qMax(sizeof(quint8) * vecSize, sizeof(void *));
const size_t uint32VecAlignment = qMax(sizeof(quint32) * vecSize, sizeof(void *));
const size_t floatVecAlignment = qMax(sizeof(float) * vecSize, sizeof(void *));
#endif
1066 
1067 void KisCompositionBenchmark::benchmarkUintFloat()
1068 {
1069 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
1070     using uint_v = xsimd::batch<unsigned int, xsimd::current_arch>;
1071 
1072     const int dataSize = 4096;
1073     void *ptr = 0;
1074     int error = MEMALIGN_ALLOC(&ptr, uint8VecAlignment, dataSize);
1075     if (error) {
1076         qFatal("posix_memalign failed: %d", error);
1077     }
1078     quint8 *iData = (quint8*)ptr;
1079     error = MEMALIGN_ALLOC(&ptr, floatVecAlignment, dataSize * sizeof(float));
1080     if (error) {
1081         qFatal("posix_memalign failed: %d", error);
1082     }
1083     float *fData = (float*)ptr;
1084 
1085     QBENCHMARK {
1086         for (int i = 0; i < dataSize; i += float_v::size) {
1087             // convert uint -> float directly, this causes
1088             // static_cast helper be called
1089             const auto b = xsimd::batch_cast<typename float_v::value_type>(
1090                 xsimd::load_and_extend<uint_v>(iData + i)
1091             );
1092             b.store_aligned(fData + i);
1093         }
1094     }
1095 
1096     MEMALIGN_FREE(iData);
1097     MEMALIGN_FREE(fData);
1098 #endif
1099 }
1100 
1101 void KisCompositionBenchmark::benchmarkUintIntFloat()
1102 {
1103 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
1104     using uint_v = xsimd::batch<unsigned int, xsimd::current_arch>;
1105 
1106     const int dataSize = 4096;
1107     void *ptr = 0;
1108     int error = MEMALIGN_ALLOC(&ptr, uint8VecAlignment, dataSize);
1109     if (error) {
1110         qFatal("posix_memalign failed: %d", error);
1111     }
1112     quint8 *iData = (quint8*)ptr;
1113     error = MEMALIGN_ALLOC(&ptr, floatVecAlignment, dataSize * sizeof(float));
1114     if (error) {
1115         qFatal("posix_memalign failed: %d", error);
1116     }
1117     float *fData = (float*)ptr;
1118 
1119     QBENCHMARK {
1120         for (int i = 0; i < dataSize; i += float_v::size) {
1121             // convert uint->int->float, that avoids special sign
1122             // treating, and gives 2.6 times speedup
1123             const auto b = xsimd::batch_cast<typename float_v::value_type>(xsimd::load_and_extend<uint_v>(iData + i));
1124             b.store_aligned(fData + i);
1125         }
1126     }
1127 
1128     MEMALIGN_FREE(iData);
1129     MEMALIGN_FREE(fData);
1130 #endif
1131 }
1132 
1133 void KisCompositionBenchmark::benchmarkFloatUint()
1134 {
1135 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
1136     using uint_v = xsimd::batch<unsigned int, xsimd::current_arch>;
1137 
1138     const int dataSize = 4096;
1139     void *ptr = 0;
1140     int error = MEMALIGN_ALLOC(&ptr, uint32VecAlignment, dataSize * sizeof(quint32));
1141     if (error) {
1142         qFatal("posix_memalign failed: %d", error);
1143     }
1144     quint32 *iData = (quint32*)ptr;
1145     error = MEMALIGN_ALLOC(&ptr, floatVecAlignment, dataSize * sizeof(float));
1146     if (error) {
1147         qFatal("posix_memalign failed: %d", error);
1148     }
1149     float *fData = (float*)ptr;
1150 
1151     QBENCHMARK {
1152         for (int i = 0; i < dataSize; i += float_v::size) {
1153             // conversion float -> uint
1154             // this being a direct conversion, load_and_extend does not apply
1155             const auto b = xsimd::batch_cast<typename uint_v::value_type>(float_v::load_aligned(fData + i));
1156 
1157             b.store_aligned(iData + i);
1158         }
1159     }
1160 
1161     MEMALIGN_FREE(iData);
1162     MEMALIGN_FREE(fData);
1163 #endif
1164 }
1165 
1166 void KisCompositionBenchmark::benchmarkFloatIntUint()
1167 {
1168 #if defined(HAVE_XSIMD) && !defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) && XSIMD_UNIVERSAL_BUILD_PASS
1169     using uint_v = xsimd::batch<unsigned int, xsimd::current_arch>;
1170     const int dataSize = 4096;
1171     void *ptr = 0;
1172     int error = MEMALIGN_ALLOC(&ptr, uint32VecAlignment, dataSize * sizeof(quint32));
1173     if (error) {
1174         qFatal("posix_memalign failed: %d", error);
1175     }
1176     quint32 *iData = (quint32*)ptr;
1177     error = MEMALIGN_ALLOC(&ptr, floatVecAlignment, dataSize * sizeof(float));
1178     if (error) {
1179         qFatal("posix_memalign failed: %d", error);
1180     }
1181     float *fData = (float*)ptr;
1182 
1183     QBENCHMARK {
1184         for (int i = 0; i < dataSize; i += float_v::size) {
1185             // conversion float -> int -> uint
1186             const auto b = xsimd::batch_cast<typename uint_v::value_type>(float_v::load_aligned(fData + i));
1187 
1188             b.store_aligned(iData + i);
1189         }
1190     }
1191 
1192     MEMALIGN_FREE(iData);
1193     MEMALIGN_FREE(fData);
1194 #endif
1195 }
1196 
1197 SIMPLE_TEST_MAIN(KisCompositionBenchmark)
1198