File indexing completed on 2024-04-21 05:44:09

0001 /*
0002     SPDX-FileCopyrightText: 2003 Otto Bruggeman <bruggie@home.nl>
0003     SPDX-FileCopyrightText: 2011 Dmitry Risenberg <dmitry.risenberg@gmail.com>
0004 
0005     SPDX-License-Identifier: GPL-2.0-or-later
0006 */
0007 
0008 #ifndef KOMPAREDIFF2_LEVENSHTEINTABLE_H
0009 #define KOMPAREDIFF2_LEVENSHTEINTABLE_H
0010 
0011 // #include <QString>
0012 // #include <komparediff2_logging.h>
0013 // Std
0014 #include <iostream>
0015 #include <memory>
0016 #include <vector>
0017 
0018 namespace KompareDiff2
0019 {
0020 
0021 class Marker;
0022 
0023 class Marker;
0024 
0025 /**
0026  * Computes the Levenshtein distance between two sequences.
0027  * The actual sequence contents must be prepended with one virtual item each for easier index access.
0028  */
0029 template<class SequencePair>
0030 class LevenshteinTable
0031 {
0032 public:
0033     LevenshteinTable();
0034     LevenshteinTable(unsigned int width, unsigned int height);
0035     ~LevenshteinTable() = default;
0036 
0037     LevenshteinTable(const LevenshteinTable &table) = delete;
0038     const LevenshteinTable &operator=(const LevenshteinTable &table) = delete;
0039 
0040 public:
0041     int getContent(unsigned int posX, unsigned int posY) const;
0042     int setContent(unsigned int posX, unsigned int posY, int value);
0043     bool setSize(unsigned int width, unsigned int height);
0044 
0045     unsigned int width() const
0046     {
0047         return m_width;
0048     };
0049     unsigned int height() const
0050     {
0051         return m_height;
0052     };
0053 
0054     /** Debug method  to check if the table is properly filled */
0055     void dumpLevenshteinTable();
0056 
0057     /**
0058      * This calculates the levenshtein distance of 2 sequences.
0059      * This object takes ownership of the argument
0060      */
0061     unsigned int createTable(SequencePair *sequences);
0062 
0063     void createListsOfMarkers();
0064     int chooseRoute(int c1, int c2, int c3, int current);
0065 
0066 private:
0067     unsigned int m_width = 256;
0068     unsigned int m_height = 256;
0069     unsigned int m_size;
0070     std::vector<unsigned int> m_table;
0071     std::unique_ptr<SequencePair> m_sequences;
0072 };
0073 
0074 template<class SequencePair>
0075 LevenshteinTable<SequencePair>::LevenshteinTable()
0076     : m_size(m_height * m_width)
0077     , m_table(m_size)
0078 {
0079 }
0080 
0081 template<class SequencePair>
0082 LevenshteinTable<SequencePair>::LevenshteinTable(unsigned int width, unsigned int height)
0083     : m_width(width)
0084     , m_height(height)
0085     , m_size(m_width * m_height)
0086     , m_table(m_size)
0087 {
0088 }
0089 
0090 template<class SequencePair>
0091 int LevenshteinTable<SequencePair>::getContent(unsigned int posX, unsigned int posY) const
0092 {
0093 //     qCDebug(KOMPAREDIFF2_LOG) << "Width = " << m_width << ", height = " << m_height << ", posX = " << posX << ", posY = " << posY;
0094     return m_table[posY * m_width + posX];
0095 }
0096 
0097 template<class SequencePair>
0098 int LevenshteinTable<SequencePair>::setContent(unsigned int posX, unsigned int posY, int value)
0099 {
0100     m_table[posY * m_width + posX] = value;
0101 
0102     return 0;
0103 }
0104 
0105 template<class SequencePair>
0106 bool LevenshteinTable<SequencePair>::setSize(unsigned int width, unsigned int height)
0107 {
0108     // Set a limit of 16.7 million entries, will be about 64 MB of ram, that should be plenty
0109     if (((width) * (height)) > (256 * 256 * 256))
0110         return false;
0111 
0112     if (((width) * (height)) > m_size) {
0113         m_size = width * height;
0114         m_table.resize(m_size);
0115     }
0116 
0117     m_width = width;
0118     m_height = height;
0119 
0120     return true;
0121 }
0122 
0123 template<class SequencePair>
0124 void LevenshteinTable<SequencePair>::dumpLevenshteinTable()
0125 {
0126     for (unsigned int i = 0; i < m_height; ++i) {
0127         for (unsigned int j = 0; j < m_width; ++j) {
0128             std::cout.width(3);
0129             std::cout << getContent(j, i);
0130         }
0131         std::cout << std::endl;
0132     }
0133 }
0134 
0135 template<class SequencePair>
0136 unsigned int LevenshteinTable<SequencePair>::createTable(SequencePair *sequences)
0137 {
0138     m_sequences.reset(sequences);
0139     unsigned int m = m_sequences->lengthFirst();
0140     unsigned int n = m_sequences->lengthSecond();
0141 
0142     if (!setSize(m, n))
0143         return 0;
0144 
0145     unsigned int i;
0146     unsigned int j;
0147 
0148     // initialize first row
0149     for (i = 0; i < m; ++i)
0150         setContent(i, 0, i);
0151     // initialize first column
0152     for (j = 0; j < n; ++j)
0153         setContent(0, j, j);
0154 
0155     int cost = 0, north = 0, west = 0, northwest = 0;
0156 
0157     // Optimization, calculate row wise instead of column wise, wont trash the cache so much with large strings
0158     for (j = 1; j < n; ++j) {
0159         for (i = 1; i < m; ++i) {
0160             if (m_sequences->equal(i, j))
0161                 cost = 0;
0162             else
0163                 cost = SequencePair::allowReplace ? 1 : 2;
0164 
0165             north = getContent(i, j - 1) + 1;
0166             west = getContent(i - 1, j) + 1;
0167             northwest = getContent(i - 1, j - 1) + cost;
0168 
0169             setContent(i, j, qMin(north, qMin(west, northwest)));
0170         }
0171     }
0172 
0173     return getContent(m - 1, n - 1);
0174 }
0175 
0176 template<class SequencePair>
0177 int LevenshteinTable<SequencePair>::chooseRoute(int c1, int c2, int c3, int current)
0178 {
0179 //     qCDebug(KOMPAREDIFF2_LOG) << "c1 = " << c1 << ", c2 = " << c2 << ", c3 = " << c3;
0180     // preference order: c2, c3, c1, hopefully this will work out for me
0181     if (c2 <= c1 && c2 <= c3) {
0182         if (SequencePair::allowReplace || (c2 == current)) {
0183             return 1;
0184         }
0185     }
0186 
0187     if (c3 <= c2 && c3 <= c1)
0188         return 2;
0189 
0190     return 0;
0191 }
0192 
0193 template<class SequencePair>
0194 void LevenshteinTable<SequencePair>::createListsOfMarkers()
0195 {
0196 //     qCDebug(KOMPAREDIFF2_LOG) << source;
0197 //     qCDebug(KOMPAREDIFF2_LOG) << destination;
0198 //     dumpLevenshteinTable();
0199 
0200     unsigned int x = m_width - 1;
0201     unsigned int y = m_height - 1;
0202 
0203     unsigned int difference = getContent(x, y);
0204 
0205     // If the number of differences is more than half the length of the largest string
0206     // don't bother to mark the individual changes
0207     // Patch based on work by Felix Berger as put as attachment to bug 75794
0208     if (!m_sequences->needFineGrainedOutput(difference)) {
0209         m_sequences->prependFirst(new Marker(Marker::End, x));
0210         m_sequences->prependFirst(new Marker(Marker::Start, 0));
0211         m_sequences->prependSecond(new Marker(Marker::End, y));
0212         m_sequences->prependSecond(new Marker(Marker::Start, 0));
0213         return;
0214     }
0215 
0216     Marker *c = nullptr;
0217 
0218     int n, nw, w, direction, currentValue;
0219     while (x > 0 && y > 0) {
0220         currentValue = getContent(x, y);
0221 
0222         n = getContent(x, y - 1);
0223         w = getContent(x - 1, y);
0224         nw = getContent(x - 1, y - 1);
0225         direction = chooseRoute(n, nw, w, currentValue);
0226 
0227         switch (direction) {
0228         case 0: // north
0229 //             qCDebug(KOMPAREDIFF2_LOG) << "Picking north";
0230 //             qCDebug(KOMPAREDIFF2_LOG) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
0231 
0232             if (!m_sequences->markerListSecond().isEmpty())
0233                 c = m_sequences->markerListSecond().first();
0234             else
0235                 c = nullptr;
0236 
0237             if (c && c->type() == Marker::End) {
0238 //                 qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue;
0239                 if (n == currentValue)
0240                     m_sequences->prependSecond(new Marker(Marker::Start, y));
0241             // else: the change continues, do not do anything
0242             } else {
0243 //                 qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue;
0244                 if (n < currentValue)
0245                     m_sequences->prependSecond(new Marker(Marker::End, y));
0246             }
0247 
0248             --y;
0249             if (y == 0) {
0250                 m_sequences->prependSecond(new Marker(Marker::Start, 0));
0251             }
0252             break;
0253         case 1: // northwest
0254 //             qCDebug(KOMPAREDIFF2_LOG) << "Picking northwest";
0255 //             qCDebug(KOMPAREDIFF2_LOG) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
0256 
0257             if (!m_sequences->markerListSecond().isEmpty())
0258                 c = m_sequences->markerListSecond().first();
0259             else
0260                 c = nullptr;
0261 
0262             if (c && c->type() == Marker::End) {
0263 //                 qCDebug(KOMPAREDIFF2_LOG) << "End found: CurrentValue: " << currentValue;
0264                 if (nw == currentValue)
0265                     m_sequences->prependSecond(new Marker(Marker::Start, y));
0266             // else: the change continues, do not do anything
0267             } else {
0268 //                 qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue;
0269                 if (nw < currentValue)
0270                     m_sequences->prependSecond(new Marker(Marker::End, y));
0271             }
0272 
0273             if (!m_sequences->markerListFirst().isEmpty())
0274                 c = m_sequences->markerListFirst().first();
0275             else
0276                 c = nullptr;
0277 
0278             if (c && c->type() == Marker::End) {
0279 //                 qCDebug(KOMPAREDIFF2_LOG) << "End found: CurrentValue: " << currentValue;
0280                 if (nw == currentValue)
0281                     m_sequences->prependFirst(new Marker(Marker::Start, x));
0282             // else: the change continues, do not do anything
0283             } else {
0284 //                 qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue;
0285                 if (nw < currentValue)
0286                     m_sequences->prependFirst(new Marker(Marker::End, x));
0287             }
0288 
0289             --y;
0290             --x;
0291             break;
0292         case 2: // west
0293 //             qCDebug(KOMPAREDIFF2_LOG) << "Picking west";
0294 //             qCDebug(KOMPAREDIFF2_LOG) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
0295 
0296             if (!m_sequences->markerListFirst().isEmpty())
0297                 c = m_sequences->markerListFirst().first();
0298             else
0299                 c = nullptr;
0300 
0301             if (c && c->type() == Marker::End) {
0302 //                 qCDebug(KOMPAREDIFF2_LOG) << "End found: CurrentValue: " << currentValue;
0303                 if (w == currentValue)
0304                     m_sequences->prependFirst(new Marker(Marker::Start, x));
0305                 // else: the change continues, do not do anything
0306             } else {
0307 //                 qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue;
0308                 if (w < currentValue)
0309                     m_sequences->prependFirst(new Marker(Marker::End, x));
0310             }
0311 
0312             --x;
0313             if (x == 0) {
0314                 m_sequences->prependFirst(new Marker(Marker::Start, 0));
0315             }
0316             break;
0317         }
0318     }
0319 
0320     // When leaving the loop it does not mean both are 0! If not there is still a change at the beginning of the line we missed so adding now.
0321     if (x != 0) {
0322         m_sequences->prependFirst(new Marker(Marker::End, x));
0323         m_sequences->prependFirst(new Marker(Marker::Start, 0));
0324     }
0325 
0326     if (y != 0) {
0327         m_sequences->prependSecond(new Marker(Marker::End, y));
0328         m_sequences->prependSecond(new Marker(Marker::Start, 0));
0329     }
0330 
0331 //     qCDebug(KOMPAREDIFF2_LOG) << "Source string: " << source;
0332 
0333 //     QStringList list;
0334 //     int prevValue = 0;
0335 //     MarkerListConstIterator mit = m_sequences->markerListFirst().begin();
0336 //     MarkerListConstIterator end = m_sequences->markerListFirst().end();
0337 //     for ( ; mit != end; ++mit )
0338 //     {
0339 //         c = *mit;
0340 //         qCDebug(KOMPAREDIFF2_LOG) << "Source Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
0341 //         list.append( source.mid( prevValue, c->offset() - prevValue ) );
0342 //         prevValue = c->offset();
0343 //     }
0344 //     if ( prevValue < source.length() - 1 )
0345 //     {
0346 //         list.append( source.mid( prevValue, source.length() - prevValue ) );
0347 //     }
0348 //     qCDebug(KOMPAREDIFF2_LOG) << "Source Resulting stringlist : " << list.join("\n");
0349 //
0350 //     list.clear();
0351 //     prevValue = 0;
0352 //
0353 //     qCDebug(KOMPAREDIFF2_LOG) << "Destination string: " << destination;
0354 //     mit = m_sequences->markerListSecond().begin();
0355 //     end = m_sequences->markerListSecond().end();
0356 //     for ( ; mit != end; ++mit )
0357 //     {
0358 //         c = *mit;
0359 //         qCDebug(KOMPAREDIFF2_LOG) << "Destination Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
0360 //         list.append( destination.mid( prevValue, c->offset() - prevValue ) );
0361 //         prevValue = c->offset();
0362 //     }
0363 //     if ( prevValue < destination.length() - 1 )
0364 //     {
0365 //         list.append( destination.mid( prevValue, destination.length() - prevValue ) );
0366 //     }
0367 //     qCDebug(KOMPAREDIFF2_LOG) << "Destination Resulting string : " << list.join("\n");
0368 }
0369 
0370 } // namespace KompareDiff2
0371 
0372 #endif // KOMPAREDIFF2_LEVENSHTEINTABLE_H