File indexing completed on 2024-04-28 09:39:20
0001 /* 0002 SPDX-FileCopyrightText: 2003 Otto Bruggeman <bruggie@home.nl> 0003 SPDX-FileCopyrightText: 2011 Dmitry Risenberg <dmitry.risenberg@gmail.com> 0004 0005 SPDX-License-Identifier: GPL-2.0-or-later 0006 */ 0007 0008 #ifndef KOMPAREDIFF2_LEVENSHTEINTABLE_H 0009 #define KOMPAREDIFF2_LEVENSHTEINTABLE_H 0010 0011 #include <iostream> 0012 // #include <QString> 0013 // #include <komparediffdebug.h> 0014 0015 namespace Diff2 { 0016 0017 class Marker; 0018 0019 0020 class Marker; 0021 0022 /** 0023 * Computes the Levenshtein distance between two sequences. 0024 * The actual sequence contents must be prepended with one virtual item each for easier index access. 0025 */ 0026 template<class SequencePair> class LevenshteinTable 0027 { 0028 public: 0029 LevenshteinTable(); 0030 LevenshteinTable(unsigned int width, unsigned int height); 0031 ~LevenshteinTable(); 0032 0033 public: 0034 int getContent(unsigned int posX, unsigned int posY) const; 0035 int setContent(unsigned int posX, unsigned int posY, int value); 0036 bool setSize(unsigned int width, unsigned int height); 0037 0038 unsigned int width() const { return m_width; }; 0039 unsigned int height() const { return m_height; }; 0040 0041 /** Debug method to check if the table is properly filled */ 0042 void dumpLevenshteinTable(void); 0043 0044 /** 0045 * This calculates the levenshtein distance of 2 sequences. 0046 * This object takes ownership of the argument 0047 */ 0048 unsigned int createTable(SequencePair* sequences); 0049 0050 void createListsOfMarkers(void); 0051 int chooseRoute(int c1, int c2, int c3, int current); 0052 0053 protected: 0054 LevenshteinTable(const LevenshteinTable& table); 0055 const LevenshteinTable& operator = (const LevenshteinTable& table); 0056 0057 private: 0058 unsigned int m_width; 0059 unsigned int m_height; 0060 unsigned int m_size; 0061 unsigned int* m_table; 0062 SequencePair* m_sequences; 0063 }; 0064 0065 template<class SequencePair> LevenshteinTable<SequencePair>::LevenshteinTable() 0066 : m_width(256), 0067 m_height(256), 0068 m_size(m_height* m_width), 0069 m_table(new unsigned int[ m_size ]), 0070 m_sequences(nullptr) 0071 { 0072 } 0073 0074 template<class SequencePair> LevenshteinTable<SequencePair>::LevenshteinTable(unsigned int width, unsigned int height) 0075 : m_width(width), 0076 m_height(height), 0077 m_size(m_width* m_height), 0078 m_table(new unsigned int[ m_size ]), 0079 m_sequences(0) 0080 { 0081 } 0082 0083 template<class SequencePair> LevenshteinTable<SequencePair>::~LevenshteinTable() 0084 { 0085 delete[] m_table; 0086 delete m_sequences; 0087 } 0088 0089 template<class SequencePair> int LevenshteinTable<SequencePair>::getContent(unsigned int posX, unsigned int posY) const 0090 { 0091 // qCDebug(LIBKOMPAREDIFF2) << "Width = " << m_width << ", height = " << m_height << ", posX = " << posX << ", posY = " << posY; 0092 return m_table[ posY * m_width + posX ]; 0093 } 0094 0095 template<class SequencePair> int LevenshteinTable<SequencePair>::setContent(unsigned int posX, unsigned int posY, int value) 0096 { 0097 m_table[ posY * m_width + posX ] = value; 0098 0099 return 0; 0100 } 0101 0102 template<class SequencePair> bool LevenshteinTable<SequencePair>::setSize(unsigned int width, unsigned int height) 0103 { 0104 // Set a limit of 16.7 million entries, will be about 64 MB of ram, that should be plenty 0105 if (((width) * (height)) > (256 * 256 * 256)) 0106 return false; 0107 0108 if (((width) * (height)) > m_size) 0109 { 0110 delete[] m_table; 0111 0112 m_size = width * height; 0113 m_table = new unsigned int[ m_size ]; 0114 } 0115 0116 m_width = width; 0117 m_height = height; 0118 0119 return true; 0120 } 0121 0122 template<class SequencePair> void LevenshteinTable<SequencePair>::dumpLevenshteinTable() 0123 { 0124 for (unsigned int i = 0; i < m_height; ++i) 0125 { 0126 for (unsigned int j = 0; j < m_width; ++j) 0127 { 0128 std::cout.width(3); 0129 std::cout << getContent(j, i); 0130 } 0131 std::cout << std::endl; 0132 } 0133 } 0134 0135 template<class SequencePair> unsigned int LevenshteinTable<SequencePair>::createTable(SequencePair* sequences) 0136 { 0137 m_sequences = sequences; 0138 unsigned int m = m_sequences->lengthFirst(); 0139 unsigned int n = m_sequences->lengthSecond(); 0140 0141 if (!setSize(m, n)) 0142 return 0; 0143 0144 unsigned int i; 0145 unsigned int j; 0146 0147 // initialize first row 0148 for (i = 0; i < m; ++i) 0149 setContent(i, 0, i); 0150 // initialize first column 0151 for (j = 0; j < n; ++j) 0152 setContent(0, j, j); 0153 0154 int cost = 0, north = 0, west = 0, northwest = 0; 0155 0156 // Optimization, calculate row wise instead of column wise, wont trash the cache so much with large strings 0157 for (j = 1; j < n; ++j) 0158 { 0159 for (i = 1; i < m; ++i) 0160 { 0161 if (m_sequences->equal(i, j)) 0162 cost = 0; 0163 else 0164 cost = SequencePair::allowReplace ? 1 : 2; 0165 0166 north = getContent(i, j - 1) + 1; 0167 west = getContent(i - 1, j) + 1; 0168 northwest = getContent(i - 1, j - 1) + cost; 0169 0170 setContent(i, j, qMin(north, qMin(west, northwest))); 0171 } 0172 } 0173 0174 return getContent(m - 1, n - 1); 0175 } 0176 0177 template<class SequencePair> int LevenshteinTable<SequencePair>::chooseRoute(int c1, int c2, int c3, int current) 0178 { 0179 // qCDebug(LIBKOMPAREDIFF2) << "c1 = " << c1 << ", c2 = " << c2 << ", c3 = " << c3; 0180 // preference order: c2, c3, c1, hopefully this will work out for me 0181 if (c2 <= c1 && c2 <= c3) 0182 { 0183 if (SequencePair::allowReplace || (c2 == current)) 0184 { 0185 return 1; 0186 } 0187 } 0188 0189 if (c3 <= c2 && c3 <= c1) 0190 return 2; 0191 0192 return 0; 0193 } 0194 0195 template<class SequencePair> void LevenshteinTable<SequencePair>::createListsOfMarkers() 0196 { 0197 // qCDebug(LIBKOMPAREDIFF2) << source; 0198 // qCDebug(LIBKOMPAREDIFF2) << destination; 0199 // dumpLevenshteinTable(); 0200 0201 unsigned int x = m_width - 1; 0202 unsigned int y = m_height - 1; 0203 0204 unsigned int difference = getContent(x, y); 0205 0206 // If the number of differences is more than half the length of the largest string 0207 // don't bother to mark the individual changes 0208 // Patch based on work by Felix Berger as put as attachment to bug 75794 0209 if (!m_sequences->needFineGrainedOutput(difference)) 0210 { 0211 m_sequences->prependFirst(new Marker(Marker::End, x)); 0212 m_sequences->prependFirst(new Marker(Marker::Start, 0)); 0213 m_sequences->prependSecond(new Marker(Marker::End, y)); 0214 m_sequences->prependSecond(new Marker(Marker::Start, 0)); 0215 return; 0216 } 0217 0218 Marker* c = nullptr; 0219 0220 int n, nw, w, direction, currentValue; 0221 while (x > 0 && y > 0) 0222 { 0223 currentValue = getContent(x, y); 0224 0225 n = getContent(x, y - 1); 0226 w = getContent(x - 1, y); 0227 nw = getContent(x - 1, y - 1); 0228 direction = chooseRoute(n, nw, w, currentValue); 0229 0230 switch (direction) 0231 { 0232 case 0: // north 0233 // qCDebug(LIBKOMPAREDIFF2) << "Picking north"; 0234 // qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] ); 0235 0236 if (!m_sequences->markerListSecond().isEmpty()) 0237 c = m_sequences->markerListSecond().first(); 0238 else 0239 c = nullptr; 0240 0241 if (c && c->type() == Marker::End) 0242 { 0243 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue; 0244 if (n == currentValue) 0245 m_sequences->prependSecond(new Marker(Marker::Start, y)); 0246 // else: the change continues, do not do anything 0247 } 0248 else 0249 { 0250 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue; 0251 if (n < currentValue) 0252 m_sequences->prependSecond(new Marker(Marker::End, y)); 0253 } 0254 0255 --y; 0256 if (y == 0) { 0257 m_sequences->prependSecond(new Marker(Marker::Start, 0)); 0258 } 0259 break; 0260 case 1: // northwest 0261 // qCDebug(LIBKOMPAREDIFF2) << "Picking northwest"; 0262 // qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] ); 0263 0264 if (!m_sequences->markerListSecond().isEmpty()) 0265 c = m_sequences->markerListSecond().first(); 0266 else 0267 c = nullptr; 0268 0269 if (c && c->type() == Marker::End) 0270 { 0271 // qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue; 0272 if (nw == currentValue) 0273 m_sequences->prependSecond(new Marker(Marker::Start, y)); 0274 // else: the change continues, do not do anything 0275 } 0276 else 0277 { 0278 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue; 0279 if (nw < currentValue) 0280 m_sequences->prependSecond(new Marker(Marker::End, y)); 0281 } 0282 0283 if (!m_sequences->markerListFirst().isEmpty()) 0284 c = m_sequences->markerListFirst().first(); 0285 else 0286 c = nullptr; 0287 0288 if (c && c->type() == Marker::End) 0289 { 0290 // qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue; 0291 if (nw == currentValue) 0292 m_sequences->prependFirst(new Marker(Marker::Start, x)); 0293 // else: the change continues, do not do anything 0294 } 0295 else 0296 { 0297 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue; 0298 if (nw < currentValue) 0299 m_sequences->prependFirst(new Marker(Marker::End, x)); 0300 } 0301 0302 --y; 0303 --x; 0304 break; 0305 case 2: // west 0306 // qCDebug(LIBKOMPAREDIFF2) << "Picking west"; 0307 // qCDebug(LIBKOMPAREDIFF2) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] ); 0308 0309 if (!m_sequences->markerListFirst().isEmpty()) 0310 c = m_sequences->markerListFirst().first(); 0311 else 0312 c = nullptr; 0313 0314 if (c && c->type() == Marker::End) 0315 { 0316 // qCDebug(LIBKOMPAREDIFF2) << "End found: CurrentValue: " << currentValue; 0317 if (w == currentValue) 0318 m_sequences->prependFirst(new Marker(Marker::Start, x)); 0319 // else: the change continues, do not do anything 0320 } 0321 else 0322 { 0323 // qCDebug(LIBKOMPAREDIFF2) << "CurrentValue: " << currentValue; 0324 if (w < currentValue) 0325 m_sequences->prependFirst(new Marker(Marker::End, x)); 0326 } 0327 0328 --x; 0329 if (x == 0) { 0330 m_sequences->prependFirst(new Marker(Marker::Start, 0)); 0331 } 0332 break; 0333 } 0334 } 0335 0336 // When leaving the loop it does not mean both are 0! If not there is still a change at the beginning of the line we missed so adding now. 0337 if (x != 0) 0338 { 0339 m_sequences->prependFirst(new Marker(Marker::End, x)); 0340 m_sequences->prependFirst(new Marker(Marker::Start, 0)); 0341 } 0342 0343 if (y != 0) 0344 { 0345 m_sequences->prependSecond(new Marker(Marker::End, y)); 0346 m_sequences->prependSecond(new Marker(Marker::Start, 0)); 0347 } 0348 0349 // qCDebug(LIBKOMPAREDIFF2) << "Source string: " << source; 0350 0351 // QStringList list; 0352 // int prevValue = 0; 0353 // MarkerListConstIterator mit = m_sequences->markerListFirst().begin(); 0354 // MarkerListConstIterator end = m_sequences->markerListFirst().end(); 0355 // for ( ; mit != end; ++mit ) 0356 // { 0357 // c = *mit; 0358 // qCDebug(LIBKOMPAREDIFF2) << "Source Marker Entry : Type: " << c->type() << ", Offset: " << c->offset(); 0359 // list.append( source.mid( prevValue, c->offset() - prevValue ) ); 0360 // prevValue = c->offset(); 0361 // } 0362 // if ( prevValue < source.length() - 1 ) 0363 // { 0364 // list.append( source.mid( prevValue, source.length() - prevValue ) ); 0365 // } 0366 // qCDebug(LIBKOMPAREDIFF2) << "Source Resulting stringlist : " << list.join("\n"); 0367 // 0368 // list.clear(); 0369 // prevValue = 0; 0370 // 0371 // qCDebug(LIBKOMPAREDIFF2) << "Destination string: " << destination; 0372 // mit = m_sequences->markerListSecond().begin(); 0373 // end = m_sequences->markerListSecond().end(); 0374 // for ( ; mit != end; ++mit ) 0375 // { 0376 // c = *mit; 0377 // qCDebug(LIBKOMPAREDIFF2) << "Destination Marker Entry : Type: " << c->type() << ", Offset: " << c->offset(); 0378 // list.append( destination.mid( prevValue, c->offset() - prevValue ) ); 0379 // prevValue = c->offset(); 0380 // } 0381 // if ( prevValue < destination.length() - 1 ) 0382 // { 0383 // list.append( destination.mid( prevValue, destination.length() - prevValue ) ); 0384 // } 0385 // qCDebug(LIBKOMPAREDIFF2) << "Destination Resulting string : " << list.join("\n"); 0386 } 0387 0388 } // namespace Diff2 0389 0390 #endif // KOMPAREDIFF2_LEVENSHTEINTABLE_H