File indexing completed on 2024-04-21 05:44:09
0001 /* 0002 SPDX-FileCopyrightText: 2003 Otto Bruggeman <bruggie@home.nl> 0003 SPDX-FileCopyrightText: 2011 Dmitry Risenberg <dmitry.risenberg@gmail.com> 0004 0005 SPDX-License-Identifier: GPL-2.0-or-later 0006 */ 0007 0008 #ifndef KOMPAREDIFF2_LEVENSHTEINTABLE_H 0009 #define KOMPAREDIFF2_LEVENSHTEINTABLE_H 0010 0011 // #include <QString> 0012 // #include <komparediff2_logging.h> 0013 // Std 0014 #include <iostream> 0015 #include <memory> 0016 #include <vector> 0017 0018 namespace KompareDiff2 0019 { 0020 0021 class Marker; 0022 0023 class Marker; 0024 0025 /** 0026 * Computes the Levenshtein distance between two sequences. 0027 * The actual sequence contents must be prepended with one virtual item each for easier index access. 0028 */ 0029 template<class SequencePair> 0030 class LevenshteinTable 0031 { 0032 public: 0033 LevenshteinTable(); 0034 LevenshteinTable(unsigned int width, unsigned int height); 0035 ~LevenshteinTable() = default; 0036 0037 LevenshteinTable(const LevenshteinTable &table) = delete; 0038 const LevenshteinTable &operator=(const LevenshteinTable &table) = delete; 0039 0040 public: 0041 int getContent(unsigned int posX, unsigned int posY) const; 0042 int setContent(unsigned int posX, unsigned int posY, int value); 0043 bool setSize(unsigned int width, unsigned int height); 0044 0045 unsigned int width() const 0046 { 0047 return m_width; 0048 }; 0049 unsigned int height() const 0050 { 0051 return m_height; 0052 }; 0053 0054 /** Debug method to check if the table is properly filled */ 0055 void dumpLevenshteinTable(); 0056 0057 /** 0058 * This calculates the levenshtein distance of 2 sequences. 0059 * This object takes ownership of the argument 0060 */ 0061 unsigned int createTable(SequencePair *sequences); 0062 0063 void createListsOfMarkers(); 0064 int chooseRoute(int c1, int c2, int c3, int current); 0065 0066 private: 0067 unsigned int m_width = 256; 0068 unsigned int m_height = 256; 0069 unsigned int m_size; 0070 std::vector<unsigned int> m_table; 0071 std::unique_ptr<SequencePair> m_sequences; 0072 }; 0073 0074 template<class SequencePair> 0075 LevenshteinTable<SequencePair>::LevenshteinTable() 0076 : m_size(m_height * m_width) 0077 , m_table(m_size) 0078 { 0079 } 0080 0081 template<class SequencePair> 0082 LevenshteinTable<SequencePair>::LevenshteinTable(unsigned int width, unsigned int height) 0083 : m_width(width) 0084 , m_height(height) 0085 , m_size(m_width * m_height) 0086 , m_table(m_size) 0087 { 0088 } 0089 0090 template<class SequencePair> 0091 int LevenshteinTable<SequencePair>::getContent(unsigned int posX, unsigned int posY) const 0092 { 0093 // qCDebug(KOMPAREDIFF2_LOG) << "Width = " << m_width << ", height = " << m_height << ", posX = " << posX << ", posY = " << posY; 0094 return m_table[posY * m_width + posX]; 0095 } 0096 0097 template<class SequencePair> 0098 int LevenshteinTable<SequencePair>::setContent(unsigned int posX, unsigned int posY, int value) 0099 { 0100 m_table[posY * m_width + posX] = value; 0101 0102 return 0; 0103 } 0104 0105 template<class SequencePair> 0106 bool LevenshteinTable<SequencePair>::setSize(unsigned int width, unsigned int height) 0107 { 0108 // Set a limit of 16.7 million entries, will be about 64 MB of ram, that should be plenty 0109 if (((width) * (height)) > (256 * 256 * 256)) 0110 return false; 0111 0112 if (((width) * (height)) > m_size) { 0113 m_size = width * height; 0114 m_table.resize(m_size); 0115 } 0116 0117 m_width = width; 0118 m_height = height; 0119 0120 return true; 0121 } 0122 0123 template<class SequencePair> 0124 void LevenshteinTable<SequencePair>::dumpLevenshteinTable() 0125 { 0126 for (unsigned int i = 0; i < m_height; ++i) { 0127 for (unsigned int j = 0; j < m_width; ++j) { 0128 std::cout.width(3); 0129 std::cout << getContent(j, i); 0130 } 0131 std::cout << std::endl; 0132 } 0133 } 0134 0135 template<class SequencePair> 0136 unsigned int LevenshteinTable<SequencePair>::createTable(SequencePair *sequences) 0137 { 0138 m_sequences.reset(sequences); 0139 unsigned int m = m_sequences->lengthFirst(); 0140 unsigned int n = m_sequences->lengthSecond(); 0141 0142 if (!setSize(m, n)) 0143 return 0; 0144 0145 unsigned int i; 0146 unsigned int j; 0147 0148 // initialize first row 0149 for (i = 0; i < m; ++i) 0150 setContent(i, 0, i); 0151 // initialize first column 0152 for (j = 0; j < n; ++j) 0153 setContent(0, j, j); 0154 0155 int cost = 0, north = 0, west = 0, northwest = 0; 0156 0157 // Optimization, calculate row wise instead of column wise, wont trash the cache so much with large strings 0158 for (j = 1; j < n; ++j) { 0159 for (i = 1; i < m; ++i) { 0160 if (m_sequences->equal(i, j)) 0161 cost = 0; 0162 else 0163 cost = SequencePair::allowReplace ? 1 : 2; 0164 0165 north = getContent(i, j - 1) + 1; 0166 west = getContent(i - 1, j) + 1; 0167 northwest = getContent(i - 1, j - 1) + cost; 0168 0169 setContent(i, j, qMin(north, qMin(west, northwest))); 0170 } 0171 } 0172 0173 return getContent(m - 1, n - 1); 0174 } 0175 0176 template<class SequencePair> 0177 int LevenshteinTable<SequencePair>::chooseRoute(int c1, int c2, int c3, int current) 0178 { 0179 // qCDebug(KOMPAREDIFF2_LOG) << "c1 = " << c1 << ", c2 = " << c2 << ", c3 = " << c3; 0180 // preference order: c2, c3, c1, hopefully this will work out for me 0181 if (c2 <= c1 && c2 <= c3) { 0182 if (SequencePair::allowReplace || (c2 == current)) { 0183 return 1; 0184 } 0185 } 0186 0187 if (c3 <= c2 && c3 <= c1) 0188 return 2; 0189 0190 return 0; 0191 } 0192 0193 template<class SequencePair> 0194 void LevenshteinTable<SequencePair>::createListsOfMarkers() 0195 { 0196 // qCDebug(KOMPAREDIFF2_LOG) << source; 0197 // qCDebug(KOMPAREDIFF2_LOG) << destination; 0198 // dumpLevenshteinTable(); 0199 0200 unsigned int x = m_width - 1; 0201 unsigned int y = m_height - 1; 0202 0203 unsigned int difference = getContent(x, y); 0204 0205 // If the number of differences is more than half the length of the largest string 0206 // don't bother to mark the individual changes 0207 // Patch based on work by Felix Berger as put as attachment to bug 75794 0208 if (!m_sequences->needFineGrainedOutput(difference)) { 0209 m_sequences->prependFirst(new Marker(Marker::End, x)); 0210 m_sequences->prependFirst(new Marker(Marker::Start, 0)); 0211 m_sequences->prependSecond(new Marker(Marker::End, y)); 0212 m_sequences->prependSecond(new Marker(Marker::Start, 0)); 0213 return; 0214 } 0215 0216 Marker *c = nullptr; 0217 0218 int n, nw, w, direction, currentValue; 0219 while (x > 0 && y > 0) { 0220 currentValue = getContent(x, y); 0221 0222 n = getContent(x, y - 1); 0223 w = getContent(x - 1, y); 0224 nw = getContent(x - 1, y - 1); 0225 direction = chooseRoute(n, nw, w, currentValue); 0226 0227 switch (direction) { 0228 case 0: // north 0229 // qCDebug(KOMPAREDIFF2_LOG) << "Picking north"; 0230 // qCDebug(KOMPAREDIFF2_LOG) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] ); 0231 0232 if (!m_sequences->markerListSecond().isEmpty()) 0233 c = m_sequences->markerListSecond().first(); 0234 else 0235 c = nullptr; 0236 0237 if (c && c->type() == Marker::End) { 0238 // qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue; 0239 if (n == currentValue) 0240 m_sequences->prependSecond(new Marker(Marker::Start, y)); 0241 // else: the change continues, do not do anything 0242 } else { 0243 // qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue; 0244 if (n < currentValue) 0245 m_sequences->prependSecond(new Marker(Marker::End, y)); 0246 } 0247 0248 --y; 0249 if (y == 0) { 0250 m_sequences->prependSecond(new Marker(Marker::Start, 0)); 0251 } 0252 break; 0253 case 1: // northwest 0254 // qCDebug(KOMPAREDIFF2_LOG) << "Picking northwest"; 0255 // qCDebug(KOMPAREDIFF2_LOG) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] ); 0256 0257 if (!m_sequences->markerListSecond().isEmpty()) 0258 c = m_sequences->markerListSecond().first(); 0259 else 0260 c = nullptr; 0261 0262 if (c && c->type() == Marker::End) { 0263 // qCDebug(KOMPAREDIFF2_LOG) << "End found: CurrentValue: " << currentValue; 0264 if (nw == currentValue) 0265 m_sequences->prependSecond(new Marker(Marker::Start, y)); 0266 // else: the change continues, do not do anything 0267 } else { 0268 // qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue; 0269 if (nw < currentValue) 0270 m_sequences->prependSecond(new Marker(Marker::End, y)); 0271 } 0272 0273 if (!m_sequences->markerListFirst().isEmpty()) 0274 c = m_sequences->markerListFirst().first(); 0275 else 0276 c = nullptr; 0277 0278 if (c && c->type() == Marker::End) { 0279 // qCDebug(KOMPAREDIFF2_LOG) << "End found: CurrentValue: " << currentValue; 0280 if (nw == currentValue) 0281 m_sequences->prependFirst(new Marker(Marker::Start, x)); 0282 // else: the change continues, do not do anything 0283 } else { 0284 // qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue; 0285 if (nw < currentValue) 0286 m_sequences->prependFirst(new Marker(Marker::End, x)); 0287 } 0288 0289 --y; 0290 --x; 0291 break; 0292 case 2: // west 0293 // qCDebug(KOMPAREDIFF2_LOG) << "Picking west"; 0294 // qCDebug(KOMPAREDIFF2_LOG) << "Source[" << ( x - 1 ) << "] = " << QString( source[ x-1 ] ) << ", destination[" << ( y - 1 ) << "] = " << QString( destination[ y-1 ] ); 0295 0296 if (!m_sequences->markerListFirst().isEmpty()) 0297 c = m_sequences->markerListFirst().first(); 0298 else 0299 c = nullptr; 0300 0301 if (c && c->type() == Marker::End) { 0302 // qCDebug(KOMPAREDIFF2_LOG) << "End found: CurrentValue: " << currentValue; 0303 if (w == currentValue) 0304 m_sequences->prependFirst(new Marker(Marker::Start, x)); 0305 // else: the change continues, do not do anything 0306 } else { 0307 // qCDebug(KOMPAREDIFF2_LOG) << "CurrentValue: " << currentValue; 0308 if (w < currentValue) 0309 m_sequences->prependFirst(new Marker(Marker::End, x)); 0310 } 0311 0312 --x; 0313 if (x == 0) { 0314 m_sequences->prependFirst(new Marker(Marker::Start, 0)); 0315 } 0316 break; 0317 } 0318 } 0319 0320 // When leaving the loop it does not mean both are 0! If not there is still a change at the beginning of the line we missed so adding now. 0321 if (x != 0) { 0322 m_sequences->prependFirst(new Marker(Marker::End, x)); 0323 m_sequences->prependFirst(new Marker(Marker::Start, 0)); 0324 } 0325 0326 if (y != 0) { 0327 m_sequences->prependSecond(new Marker(Marker::End, y)); 0328 m_sequences->prependSecond(new Marker(Marker::Start, 0)); 0329 } 0330 0331 // qCDebug(KOMPAREDIFF2_LOG) << "Source string: " << source; 0332 0333 // QStringList list; 0334 // int prevValue = 0; 0335 // MarkerListConstIterator mit = m_sequences->markerListFirst().begin(); 0336 // MarkerListConstIterator end = m_sequences->markerListFirst().end(); 0337 // for ( ; mit != end; ++mit ) 0338 // { 0339 // c = *mit; 0340 // qCDebug(KOMPAREDIFF2_LOG) << "Source Marker Entry : Type: " << c->type() << ", Offset: " << c->offset(); 0341 // list.append( source.mid( prevValue, c->offset() - prevValue ) ); 0342 // prevValue = c->offset(); 0343 // } 0344 // if ( prevValue < source.length() - 1 ) 0345 // { 0346 // list.append( source.mid( prevValue, source.length() - prevValue ) ); 0347 // } 0348 // qCDebug(KOMPAREDIFF2_LOG) << "Source Resulting stringlist : " << list.join("\n"); 0349 // 0350 // list.clear(); 0351 // prevValue = 0; 0352 // 0353 // qCDebug(KOMPAREDIFF2_LOG) << "Destination string: " << destination; 0354 // mit = m_sequences->markerListSecond().begin(); 0355 // end = m_sequences->markerListSecond().end(); 0356 // for ( ; mit != end; ++mit ) 0357 // { 0358 // c = *mit; 0359 // qCDebug(KOMPAREDIFF2_LOG) << "Destination Marker Entry : Type: " << c->type() << ", Offset: " << c->offset(); 0360 // list.append( destination.mid( prevValue, c->offset() - prevValue ) ); 0361 // prevValue = c->offset(); 0362 // } 0363 // if ( prevValue < destination.length() - 1 ) 0364 // { 0365 // list.append( destination.mid( prevValue, destination.length() - prevValue ) ); 0366 // } 0367 // qCDebug(KOMPAREDIFF2_LOG) << "Destination Resulting string : " << list.join("\n"); 0368 } 0369 0370 } // namespace KompareDiff2 0371 0372 #endif // KOMPAREDIFF2_LEVENSHTEINTABLE_H