File indexing completed on 2024-04-28 09:39:20

0001 /*
0002 SPDX-FileCopyrightText: 2003 Otto Bruggeman <bruggie@home.nl>
0003 SPDX-FileCopyrightText: 2011 Dmitry Risenberg <dmitry.risenberg@gmail.com>
0004 
0005 SPDX-License-Identifier: GPL-2.0-or-later
0006 */
0007 
0008 #ifndef KOMPAREDIFF2_LEVENSHTEINTABLE_H
0009 #define KOMPAREDIFF2_LEVENSHTEINTABLE_H
0010 
0011 #include <iostream>
0012 // #include <QString>
0013 // #include <komparediffdebug.h>
0014 
0015 namespace Diff2 {
0016 
0017 class Marker;
0018 
0019 
0020 class Marker;
0021 
0022 /**
0023  * Computes the Levenshtein distance between two sequences.
0024  * The actual sequence contents must be prepended with one virtual item each for easier index access.
0025  */
0026 template<class SequencePair> class LevenshteinTable
0027 {
0028 public:
0029     LevenshteinTable();
0030     LevenshteinTable(unsigned int width, unsigned int height);
0031     ~LevenshteinTable();
0032 
0033 public:
0034     int  getContent(unsigned int posX, unsigned int posY) const;
0035     int  setContent(unsigned int posX, unsigned int posY, int value);
0036     bool setSize(unsigned int width, unsigned int height);
0037 
0038     unsigned int width()  const { return m_width; };
0039     unsigned int height() const { return m_height; };
0040 
0041     /** Debug method  to check if the table is properly filled */
0042     void dumpLevenshteinTable(void);
0043 
0044     /**
0045      * This calculates the levenshtein distance of 2 sequences.
0046      * This object takes ownership of the argument
0047      */
0048     unsigned int createTable(SequencePair* sequences);
0049 
0050     void createListsOfMarkers(void);
0051     int chooseRoute(int c1, int c2, int c3, int current);
0052 
0053 protected:
0054     LevenshteinTable(const LevenshteinTable& table);
0055     const LevenshteinTable& operator = (const LevenshteinTable& table);
0056 
0057 private:
0058     unsigned int      m_width;
0059     unsigned int      m_height;
0060     unsigned int      m_size;
0061     unsigned int*     m_table;
0062     SequencePair*     m_sequences;
0063 };
0064 
0065 template<class SequencePair> LevenshteinTable<SequencePair>::LevenshteinTable()
0066     : m_width(256),
0067       m_height(256),
0068       m_size(m_height* m_width),
0069       m_table(new unsigned int[ m_size ]),
0070       m_sequences(nullptr)
0071 {
0072 }
0073 
0074 template<class SequencePair> LevenshteinTable<SequencePair>::LevenshteinTable(unsigned int width, unsigned int height)
0075     : m_width(width),
0076       m_height(height),
0077       m_size(m_width* m_height),
0078       m_table(new unsigned int[ m_size ]),
0079       m_sequences(0)
0080 {
0081 }
0082 
0083 template<class SequencePair> LevenshteinTable<SequencePair>::~LevenshteinTable()
0084 {
0085     delete[] m_table;
0086     delete m_sequences;
0087 }
0088 
0089 template<class SequencePair> int LevenshteinTable<SequencePair>::getContent(unsigned int posX, unsigned int posY) const
0090 {
0091 //     qCDebug(LIBKOMPAREDIFF2) << "Width = " << m_width << ", height = " << m_height << ", posX = " << posX << ", posY = " << posY;
0092     return m_table[ posY * m_width + posX ];
0093 }
0094 
0095 template<class SequencePair> int LevenshteinTable<SequencePair>::setContent(unsigned int posX, unsigned int posY, int value)
0096 {
0097     m_table[ posY * m_width + posX ] = value;
0098 
0099     return 0;
0100 }
0101 
0102 template<class SequencePair> bool LevenshteinTable<SequencePair>::setSize(unsigned int width, unsigned int height)
0103 {
0104     // Set a limit of 16.7 million entries, will be about 64 MB of ram, that should be plenty
0105     if (((width) * (height)) > (256 * 256 * 256))
0106         return false;
0107 
0108     if (((width) * (height)) > m_size)
0109     {
0110         delete[] m_table;
0111 
0112         m_size = width * height;
0113         m_table = new unsigned int[ m_size ];
0114     }
0115 
0116     m_width = width;
0117     m_height = height;
0118 
0119     return true;
0120 }
0121 
0122 template<class SequencePair> void LevenshteinTable<SequencePair>::dumpLevenshteinTable()
0123 {
0124     for (unsigned int i = 0; i < m_height; ++i)
0125     {
0126         for (unsigned int j = 0; j < m_width; ++j)
0127         {
0128             std::cout.width(3);
0129             std::cout << getContent(j, i);
0130         }
0131         std::cout << std::endl;
0132     }
0133 }
0134 
0135 template<class SequencePair> unsigned int LevenshteinTable<SequencePair>::createTable(SequencePair* sequences)
0136 {
0137     m_sequences = sequences;
0138     unsigned int m = m_sequences->lengthFirst();
0139     unsigned int n = m_sequences->lengthSecond();
0140 
0141     if (!setSize(m, n))
0142         return 0;
0143 
0144     unsigned int i;
0145     unsigned int j;
0146 
0147     // initialize first row
0148     for (i = 0; i < m; ++i)
0149         setContent(i, 0, i);
0150     // initialize first column
0151     for (j = 0; j < n; ++j)
0152         setContent(0, j, j);
0153 
0154     int cost = 0, north = 0, west = 0, northwest = 0;
0155 
0156     // Optimization, calculate row wise instead of column wise, wont trash the cache so much with large strings
0157     for (j = 1; j < n; ++j)
0158     {
0159         for (i = 1; i < m; ++i)
0160         {
0161             if (m_sequences->equal(i, j))
0162                 cost = 0;
0163             else
0164                 cost = SequencePair::allowReplace ? 1 : 2;
0165 
0166             north     = getContent(i, j - 1) + 1;
0167             west      = getContent(i - 1, j) + 1;
0168             northwest = getContent(i - 1, j - 1) + cost;
0169 
0170             setContent(i, j, qMin(north, qMin(west, northwest)));
0171         }
0172     }
0173 
0174     return getContent(m - 1, n - 1);
0175 }
0176 
0177 template<class SequencePair> int LevenshteinTable<SequencePair>::chooseRoute(int c1, int c2, int c3, int current)
0178 {
0179 //     qCDebug(LIBKOMPAREDIFF2) << "c1 = " << c1 << ", c2 = " << c2 << ", c3 = " << c3;
0180     // preference order: c2, c3, c1, hopefully this will work out for me
0181     if (c2 <= c1 && c2 <= c3)
0182     {
0183         if (SequencePair::allowReplace || (c2 == current))
0184         {
0185             return 1;
0186         }
0187     }
0188 
0189     if (c3 <= c2 && c3 <= c1)
0190         return 2;
0191 
0192     return 0;
0193 }
0194 
0195 template<class SequencePair> void LevenshteinTable<SequencePair>::createListsOfMarkers()
0196 {
0197 //     qCDebug(LIBKOMPAREDIFF2) << source;
0198 //     qCDebug(LIBKOMPAREDIFF2) << destination;
0199 //     dumpLevenshteinTable();
0200 
0201     unsigned int x = m_width - 1;
0202     unsigned int y = m_height - 1;
0203 
0204     unsigned int difference = getContent(x, y);
0205 
0206     // If the number of differences is more than half the length of the largest string
0207     // don't bother to mark the individual changes
0208     // Patch based on work by Felix Berger as put as attachment to bug 75794
0209     if (!m_sequences->needFineGrainedOutput(difference))
0210     {
0211         m_sequences->prependFirst(new Marker(Marker::End, x));
0212         m_sequences->prependFirst(new Marker(Marker::Start, 0));
0213         m_sequences->prependSecond(new Marker(Marker::End, y));
0214         m_sequences->prependSecond(new Marker(Marker::Start, 0));
0215         return;
0216     }
0217 
0218     Marker* c = nullptr;
0219 
0220     int n, nw, w, direction, currentValue;
0221     while (x > 0 && y > 0)
0222     {
0223         currentValue = getContent(x, y);
0224 
0225         n  = getContent(x, y - 1);
0226         w  = getContent(x - 1, y);
0227         nw = getContent(x - 1, y - 1);
0228         direction = chooseRoute(n, nw, w, currentValue);
0229 
0230         switch (direction)
0231         {
0232         case 0: // north
0233 //             qCDebug(LIBKOMPAREDIFF2) << "Picking north";
0234 //             qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
0235 
0236             if (!m_sequences->markerListSecond().isEmpty())
0237                 c = m_sequences->markerListSecond().first();
0238             else
0239                 c = nullptr;
0240 
0241             if (c && c->type() == Marker::End)
0242             {
0243 //                 qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
0244                 if (n == currentValue)
0245                     m_sequences->prependSecond(new Marker(Marker::Start, y));
0246 // else: the change continues, do not do anything
0247             }
0248             else
0249             {
0250 //                 qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
0251                 if (n < currentValue)
0252                     m_sequences->prependSecond(new Marker(Marker::End, y));
0253             }
0254 
0255             --y;
0256             if (y == 0) {
0257                 m_sequences->prependSecond(new Marker(Marker::Start, 0));
0258             }
0259             break;
0260         case 1: // northwest
0261 //             qCDebug(LIBKOMPAREDIFF2) << "Picking northwest";
0262 //             qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
0263 
0264             if (!m_sequences->markerListSecond().isEmpty())
0265                 c = m_sequences->markerListSecond().first();
0266             else
0267                 c = nullptr;
0268 
0269             if (c && c->type() == Marker::End)
0270             {
0271 //                 qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
0272                 if (nw == currentValue)
0273                     m_sequences->prependSecond(new Marker(Marker::Start, y));
0274                 // else: the change continues, do not do anything
0275             }
0276             else
0277             {
0278 //                 qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
0279                 if (nw < currentValue)
0280                     m_sequences->prependSecond(new Marker(Marker::End, y));
0281             }
0282 
0283             if (!m_sequences->markerListFirst().isEmpty())
0284                 c = m_sequences->markerListFirst().first();
0285             else
0286                 c = nullptr;
0287 
0288             if (c && c->type() == Marker::End)
0289             {
0290 //                 qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
0291                 if (nw == currentValue)
0292                     m_sequences->prependFirst(new Marker(Marker::Start, x));
0293                 // else: the change continues, do not do anything
0294             }
0295             else
0296             {
0297 //                 qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
0298                 if (nw < currentValue)
0299                     m_sequences->prependFirst(new Marker(Marker::End, x));
0300             }
0301 
0302             --y;
0303             --x;
0304             break;
0305         case 2: // west
0306 //             qCDebug(LIBKOMPAREDIFF2) << "Picking west";
0307 //             qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] );
0308 
0309             if (!m_sequences->markerListFirst().isEmpty())
0310                 c = m_sequences->markerListFirst().first();
0311             else
0312                 c = nullptr;
0313 
0314             if (c && c->type() == Marker::End)
0315             {
0316 //                 qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue;
0317                 if (w == currentValue)
0318                     m_sequences->prependFirst(new Marker(Marker::Start, x));
0319                 // else: the change continues, do not do anything
0320             }
0321             else
0322             {
0323 //                 qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue;
0324                 if (w < currentValue)
0325                     m_sequences->prependFirst(new Marker(Marker::End, x));
0326             }
0327 
0328             --x;
0329             if (x == 0) {
0330                 m_sequences->prependFirst(new Marker(Marker::Start, 0));
0331             }
0332             break;
0333         }
0334     }
0335 
0336     // When leaving the loop it does not mean both are 0! If not there is still a change at the beginning of the line we missed so adding now.
0337     if (x != 0)
0338     {
0339         m_sequences->prependFirst(new Marker(Marker::End, x));
0340         m_sequences->prependFirst(new Marker(Marker::Start, 0));
0341     }
0342 
0343     if (y != 0)
0344     {
0345         m_sequences->prependSecond(new Marker(Marker::End, y));
0346         m_sequences->prependSecond(new Marker(Marker::Start, 0));
0347     }
0348 
0349 //     qCDebug(LIBKOMPAREDIFF2) << "Source string: " << source;
0350 
0351 //     QStringList list;
0352 //     int prevValue = 0;
0353 //     MarkerListConstIterator mit = m_sequences->markerListFirst().begin();
0354 //     MarkerListConstIterator end = m_sequences->markerListFirst().end();
0355 //     for ( ; mit != end; ++mit )
0356 //     {
0357 //         c = *mit;
0358 //         qCDebug(LIBKOMPAREDIFF2) << "Source Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
0359 //         list.append( source.mid( prevValue, c->offset() - prevValue ) );
0360 //         prevValue = c->offset();
0361 //     }
0362 //     if ( prevValue < source.length() - 1 )
0363 //     {
0364 //         list.append( source.mid( prevValue, source.length() - prevValue ) );
0365 //     }
0366 //     qCDebug(LIBKOMPAREDIFF2) << "Source Resulting stringlist : " << list.join("\n");
0367 //
0368 //     list.clear();
0369 //     prevValue = 0;
0370 //
0371 //     qCDebug(LIBKOMPAREDIFF2) << "Destination string: " << destination;
0372 //     mit = m_sequences->markerListSecond().begin();
0373 //     end = m_sequences->markerListSecond().end();
0374 //     for ( ; mit != end; ++mit )
0375 //     {
0376 //         c = *mit;
0377 //         qCDebug(LIBKOMPAREDIFF2) << "Destination Marker Entry : Type: " << c->type() << ", Offset: " << c->offset();
0378 //         list.append( destination.mid( prevValue, c->offset() - prevValue ) );
0379 //         prevValue = c->offset();
0380 //     }
0381 //     if ( prevValue < destination.length() - 1 )
0382 //     {
0383 //         list.append( destination.mid( prevValue, destination.length() - prevValue ) );
0384 //     }
0385 //     qCDebug(LIBKOMPAREDIFF2) << "Destination Resulting string : " << list.join("\n");
0386 }
0387 
0388 } // namespace Diff2
0389 
0390 #endif // KOMPAREDIFF2_LEVENSHTEINTABLE_H