File indexing completed on 2024-12-29 04:51:07
0001 /* 0002 SPDX-FileCopyrightText: 2023 Volker Krause <vkrause@kde.org> 0003 SPDX-License-Identifier: LGPL-2.0-or-later 0004 */ 0005 0006 #include "pricefinder_p.h" 0007 0008 #include <KItinerary/PriceUtil> 0009 0010 #include <QDebug> 0011 #include <QLocale> 0012 #include <QRegularExpression> 0013 0014 #include <cmath> 0015 #include <cstring> 0016 0017 using namespace KItinerary; 0018 0019 std::vector<PriceFinder::CurrencyData> PriceFinder::s_currencyData; 0020 0021 // normalize currency symbols, as e.g. "wide Yen" and "normal Yen" should be considered the same 0022 static QString normalizeSymbol(QStringView str) 0023 { 0024 QString out; 0025 out.reserve(str.size()); 0026 for (const auto c : str) { 0027 if (c.decompositionTag() == QChar::Wide) { 0028 out.push_back(c.decomposition().at(0)); 0029 } else { 0030 out.push_back(c); 0031 } 0032 } 0033 return out; 0034 } 0035 0036 static bool isCollidingSymbol(QStringView lhs, QStringView rhs) 0037 { 0038 return lhs == rhs 0039 || (lhs.size() == rhs.size() + 1 && lhs.back() == QLatin1Char('.') && lhs.startsWith(rhs)) 0040 || (rhs.size() == lhs.size() + 1 && rhs.back() == QLatin1Char('.') && rhs.startsWith(lhs)); 0041 } 0042 0043 // overrides to QLocale data 0044 // ### keep sorted by ISO code 0045 struct { 0046 const char isoCode[4]; 0047 const char *symbol; 0048 } static constexpr const currency_data_overrides[] = { 0049 { "BAM", nullptr }, // BAM's symbol is "KM", which collides with distance values on train tickets too often 0050 { "GBP", "£" }, // FKP, GIP and SHP are practically GPB-equivalent using the pound sign, SSP has it wrongly assigned in QLocale 0051 { "JPY", "円"}, // the Yen sign is also used by CNY and thus ambigious, but the Japanese Yen symbol works 0052 }; 0053 0054 PriceFinder::PriceFinder() 0055 { 0056 if (!s_currencyData.empty()) { 0057 return; 0058 } 0059 0060 const auto allLocales = QLocale::matchingLocales(QLocale::AnyLanguage, QLocale::AnyScript, QLocale::AnyCountry); 0061 for (const auto &locale : allLocales) { 0062 CurrencyData data{locale.currencySymbol(QLocale::CurrencyIsoCode), normalizeSymbol(locale.currencySymbol(QLocale::CurrencySymbol))}; 0063 if (data.isoCode.isEmpty()) { 0064 continue; 0065 } 0066 0067 // single letter symbols tend to be way too trigger-happy 0068 if (data.symbol.size() == 1 && data.symbol[0].isLetter()) { 0069 //qDebug() << "Dropping single letter symbol:" << data.symbol << data.isoCode; 0070 data.symbol.clear(); 0071 } 0072 0073 s_currencyData.push_back(std::move(data)); 0074 } 0075 0076 // remove duplicates 0077 const auto lessThanCurrencyData = [](const auto &lhs, const auto &rhs) { 0078 return std::tie(lhs.isoCode, lhs.symbol) < std::tie(rhs.isoCode, rhs.symbol); 0079 }; 0080 std::sort(s_currencyData.begin(), s_currencyData.end(), lessThanCurrencyData); 0081 const auto compareCurrencyData = [](const auto &lhs, const auto &rhs) { 0082 return lhs.isoCode == rhs.isoCode && lhs.symbol == rhs.symbol; 0083 }; 0084 s_currencyData.erase(std::unique(s_currencyData.begin(), s_currencyData.end(), compareCurrencyData), s_currencyData.end()); 0085 0086 // clear ambigious symbols 0087 for (auto it = s_currencyData.begin(); it != s_currencyData.end(); ++it) { 0088 if ((*it).symbol.isEmpty()) { 0089 continue; 0090 } 0091 bool collision = false; 0092 for (auto it2 = std::next(it); it2 != s_currencyData.end(); ++it2) { 0093 if (!isCollidingSymbol((*it).symbol, (*it2).symbol)) { 0094 continue; 0095 } 0096 (*it2).symbol.clear(); 0097 if (!collision) { 0098 qDebug() << "Ambigious currency symbol:" << (*it).symbol; 0099 } 0100 collision = true; 0101 } 0102 if (collision) { 0103 (*it).symbol.clear(); 0104 } 0105 } 0106 0107 // apply our own overrides over QLocale 0108 for (auto it = s_currencyData.begin(); it != s_currencyData.end(); ++it) { 0109 const auto it2 = std::lower_bound(std::begin(currency_data_overrides), std::end(currency_data_overrides), (*it).isoCode, [](const auto &lhs, const auto &rhs) { 0110 return std::strncmp(lhs.isoCode, rhs.toLatin1().constData(), 3) < 0; 0111 }); 0112 if (it2 == std::end(currency_data_overrides) || std::strncmp((*it2).isoCode, (*it).isoCode.toLatin1().constData(), 3) != 0) { 0113 continue; 0114 } 0115 (*it).symbol = (*it2).symbol ? QString::fromUtf8((*it2).symbol) : QString(); 0116 } 0117 } 0118 0119 PriceFinder::~PriceFinder() = default; 0120 0121 static bool isBoundaryChar(QChar c) 0122 { 0123 return c != QLatin1Char('-') && (c.isSpace() || c.isPunct() || c.isSymbol()); 0124 } 0125 0126 void PriceFinder::findAll(QStringView text, std::vector<Result> &results) const 0127 { 0128 static QRegularExpression rx(QStringLiteral(R"((?<=\s|[[:punct:]]|^)([^\d\s]{1,4})?[ ]*(\d(?:[\d,. ]*\d)?)[ ]*([^\d\s]{1,4})?(?=\s|[[:punct:]]|$))")); 0129 0130 const auto prevResultSize = results.size(); 0131 qsizetype offset = 0; 0132 while (true) { 0133 const auto match = rx.matchView(text, offset); 0134 if (!match.hasMatch()) { 0135 break; 0136 } 0137 offset = match.capturedEnd(2); 0138 0139 const auto leadingCurrency = parseCurrency(match.capturedView(1), CurrencyPrefix); 0140 const auto trailingCurrency = parseCurrency(match.capturedView(3), CurrencySuffix); 0141 if ((leadingCurrency.isEmpty() && trailingCurrency.isEmpty()) || (!leadingCurrency.isEmpty() && !trailingCurrency.isEmpty() && leadingCurrency != trailingCurrency)) { 0142 continue; 0143 } 0144 0145 // additional boundary checks not covered by the regular expression 0146 if (leadingCurrency.isEmpty() && match.capturedStart(2) > 0 && !isBoundaryChar(text[match.capturedStart(2) - 1])) { 0147 continue; 0148 } 0149 if (trailingCurrency.isEmpty() && match.capturedEnd(2) < text.size() - 2 && !isBoundaryChar(text[match.capturedEnd(2)])) { 0150 continue; 0151 } 0152 0153 Result r; 0154 r.start = leadingCurrency.isEmpty() ? match.capturedStart(2) : match.capturedStart(); 0155 r.end = trailingCurrency.isEmpty() ? match.capturedEnd(2) : match.capturedEnd(); 0156 r.currency = leadingCurrency.isEmpty() ? trailingCurrency : leadingCurrency; 0157 0158 r.value = parseValue(match.capturedView(2), r.currency); 0159 if (std::isnan(r.value)) { 0160 continue; 0161 } 0162 0163 results.push_back(std::move(r)); 0164 } 0165 0166 // check for overlapping results: in those case we have to assume the entire result is invalid 0167 if (results.size() <= 1 + prevResultSize) { 0168 return; 0169 } 0170 for (auto it = results.begin() + prevResultSize; it != std::prev(results.end()); ++it) { 0171 if ((*it).end >= (*std::next(it)).start) { 0172 qDebug() << "overlapping price data, discarding result"; 0173 results.erase(results.begin() + prevResultSize, results.end()); 0174 return; 0175 } 0176 } 0177 } 0178 0179 PriceFinder::Result PriceFinder::findHighest(QStringView text) const 0180 { 0181 std::vector<Result> results; 0182 findAll(text, results); 0183 return highest(results); 0184 } 0185 0186 bool PriceFinder::isSingleCurrency(const std::vector<Result> &results) const 0187 { 0188 if (results.empty()) { 0189 return false; 0190 } 0191 0192 const auto isoCode = results.front().currency; 0193 return std::all_of(results.begin(), results.end(), [&isoCode](const auto &r) { return r.currency == isoCode; }); 0194 } 0195 0196 PriceFinder::Result PriceFinder::highest(const std::vector<Result> &results) const 0197 { 0198 if (!isSingleCurrency(results)) { 0199 return {}; 0200 } 0201 0202 const auto it = std::max_element(results.begin(), results.end(), [](const auto &lhs, const auto &rhs) { return lhs.value < rhs.value; }); 0203 return (*it); 0204 } 0205 0206 static bool equalIgnoreDiacritics(QStringView lhs, QStringView rhs) 0207 { 0208 if (lhs.size() != rhs.size()) { 0209 return false; 0210 } 0211 0212 for (qsizetype i = 0; i < lhs.size(); ++i) { 0213 auto l = lhs[i]; 0214 if (l.decompositionTag() == QChar::Canonical) { 0215 l = l.decomposition().at(0); 0216 } 0217 auto r = rhs[i]; 0218 if (r.decompositionTag() == QChar::Canonical) { 0219 r = r.decomposition().at(0); 0220 } 0221 if (l != r) { 0222 return false; 0223 } 0224 } 0225 0226 return true; 0227 } 0228 0229 QString PriceFinder::parseCurrency(QStringView s, CurrencyPosition pos) const 0230 { 0231 // trim remaining boundary chars 0232 if (s.isEmpty()) { 0233 return {}; 0234 } 0235 0236 // valid currency ISO code 0237 auto isoCandidate = s; 0238 while (!isoCandidate.isEmpty() && isBoundaryChar(isoCandidate.last())) { 0239 isoCandidate = isoCandidate.left(isoCandidate.size() - 1); 0240 } 0241 while (!isoCandidate.isEmpty() && isBoundaryChar(isoCandidate.front())) { 0242 isoCandidate = isoCandidate.mid(1); 0243 } 0244 if (isoCandidate.size() == 3) { 0245 const auto it = std::lower_bound(s_currencyData.begin(), s_currencyData.end(), isoCandidate, [](const auto &lhs, QStringView rhs) { return lhs.isoCode < rhs; }); 0246 if (it != s_currencyData.end() && (*it).isoCode == isoCandidate) { 0247 return (*it).isoCode; 0248 } 0249 } 0250 0251 // currency symbol 0252 const auto symbol = normalizeSymbol(s); 0253 // exact match: we know there is only ever going to be one (see ctor) 0254 const auto it = std::find_if(s_currencyData.begin(), s_currencyData.end(), [&symbol](const auto &data) { return data.symbol == symbol; }); 0255 if (it != s_currencyData.end()) 0256 return (*it).isoCode; 0257 0258 // partial match: needs to be unique 0259 QString isoCode; 0260 for (const auto &data : s_currencyData) { 0261 if (data.symbol.isEmpty()) { 0262 continue; 0263 } 0264 0265 // match disregarding diacritics 0266 if (equalIgnoreDiacritics(data.symbol, symbol)) { 0267 if (!isoCode.isEmpty()) { 0268 return {}; 0269 } 0270 isoCode = data.isoCode; 0271 } 0272 0273 // prefix or suffix match 0274 if (pos == CurrencyPrefix) { 0275 if (symbol.size() <= data.symbol.size() || !symbol.endsWith(data.symbol) || !isBoundaryChar(symbol.at(symbol.size() - data.symbol.size() - 1))) { 0276 continue; 0277 } 0278 } else { 0279 if (symbol.size() <= data.symbol.size() || !symbol.startsWith(data.symbol) || !isBoundaryChar(symbol.at(data.symbol.size()))) { 0280 continue; 0281 } 0282 } 0283 if (!isoCode.isEmpty()) { 0284 return {}; 0285 } 0286 isoCode = data.isoCode; 0287 } 0288 return isoCode; 0289 } 0290 0291 double PriceFinder::parseValue(QStringView s, const QString &isoCode) const 0292 { 0293 if (s.isEmpty() || !s[0].isDigit() || !s[s.size() - 1].isDigit()) { 0294 return NAN; 0295 } 0296 0297 // find potential decimal separator 0298 QChar decimalSeparator; 0299 qsizetype decimalSeparatorIndex = -1; 0300 for (qsizetype i = s.size() - 1; i > 0; --i) { 0301 if (s[i].isDigit()) { 0302 continue; 0303 } 0304 if (!s[i].isSpace()) { 0305 decimalSeparator = s[i]; 0306 decimalSeparatorIndex = i; 0307 } 0308 break; 0309 } 0310 0311 // identify/validate group separators 0312 QChar groupSeparator; 0313 qsizetype lastGroupSeparatorIndex = -1; 0314 for (qsizetype i = 0; i < s.size(); ++i) { 0315 if (s[i].isDigit()) { 0316 continue; 0317 } 0318 if (lastGroupSeparatorIndex > 0 && i - lastGroupSeparatorIndex != 4) { // separator interval is wrong 0319 return NAN; 0320 } 0321 if (decimalSeparatorIndex > 0 && i == decimalSeparatorIndex) { // found the suspected decimal separator 0322 break; 0323 } 0324 if (!groupSeparator.isNull() && s[i] != groupSeparator) { // inconsistent separators 0325 return NAN; 0326 } 0327 0328 lastGroupSeparatorIndex = i; 0329 groupSeparator = s[i]; 0330 } 0331 0332 // we found both and they are the same: has to be the group separator 0333 if (!decimalSeparator.isNull() && !groupSeparator.isNull() && decimalSeparator == groupSeparator) { 0334 if ((s.size() - decimalSeparatorIndex) != 4) { 0335 return NAN; 0336 } 0337 decimalSeparator = {}; 0338 decimalSeparatorIndex = -1; 0339 } 0340 0341 // we found a decimal separator: verify the number of decimals is consistent with the currency's subdivision 0342 // see https://en.wikipedia.org/wiki/List_of_circulating_currencies 0343 if (!decimalSeparator.isNull()) { 0344 const auto decimalCount = s.size() - decimalSeparatorIndex - 1; 0345 const auto expectedDecimalCount = PriceUtil::decimalCount(isoCode); 0346 0347 // subdivision x1000 is ambigious if we don't have a group separator 0348 if (decimalCount == expectedDecimalCount && decimalCount == 3 && groupSeparator.isNull()) { 0349 return NAN; 0350 } 0351 0352 // if decimal count is 3, assume group separator 0353 else if (decimalCount != expectedDecimalCount && decimalCount == 3) { 0354 if (groupSeparator.isNull()) { 0355 groupSeparator = decimalSeparator; 0356 decimalSeparator = {}; 0357 } else { 0358 return NAN; 0359 } 0360 } 0361 0362 else if (decimalCount > expectedDecimalCount) { 0363 return NAN; 0364 } 0365 } 0366 0367 // strip group separators, replace decimal separator 0368 auto normalized = s.toString(); 0369 if (!groupSeparator.isNull()) { 0370 normalized.remove(groupSeparator); 0371 } 0372 if (!decimalSeparator.isNull()) { 0373 normalized.replace(decimalSeparator, QLatin1Char('.')); 0374 } 0375 0376 bool ok = false; 0377 const auto value = normalized.toDouble(&ok); 0378 if (!ok) { 0379 return NAN; 0380 } 0381 return value; 0382 }