File indexing completed on 2024-12-29 04:51:07

0001 /*
0002     SPDX-FileCopyrightText: 2023 Volker Krause <vkrause@kde.org>
0003     SPDX-License-Identifier: LGPL-2.0-or-later
0004 */
0005 
0006 #include "pricefinder_p.h"
0007 
0008 #include <KItinerary/PriceUtil>
0009 
0010 #include <QDebug>
0011 #include <QLocale>
0012 #include <QRegularExpression>
0013 
0014 #include <cmath>
0015 #include <cstring>
0016 
0017 using namespace KItinerary;
0018 
// Currency table (ISO code + symbol) shared by all PriceFinder instances.
// It is filled by the first PriceFinder constructor that finds it empty.
std::vector<PriceFinder::CurrencyData> PriceFinder::s_currencyData;
0020 
0021 // normalize currency symbols, as e.g. "wide Yen" and "normal Yen" should be considered the same
0022 static QString normalizeSymbol(QStringView str)
0023 {
0024     QString out;
0025     out.reserve(str.size());
0026     for (const auto c : str) {
0027         if (c.decompositionTag() == QChar::Wide) {
0028             out.push_back(c.decomposition().at(0));
0029         } else {
0030             out.push_back(c);
0031         }
0032     }
0033     return out;
0034 }
0035 
0036 static bool isCollidingSymbol(QStringView lhs, QStringView rhs)
0037 {
0038     return lhs == rhs
0039         || (lhs.size() == rhs.size() + 1 && lhs.back() == QLatin1Char('.') && lhs.startsWith(rhs))
0040         || (rhs.size() == lhs.size() + 1 && rhs.back() == QLatin1Char('.') && rhs.startsWith(lhs));
0041 }
0042 
// Overrides applied on top of the currency data obtained from QLocale.
// A nullptr symbol removes the symbol of that currency entirely.
// ### keep sorted by ISO code, the table is searched with std::lower_bound
static constexpr const struct {
    const char isoCode[4];
    const char *symbol;
} currency_data_overrides[] = {
    // BAM's symbol is "KM", which collides with distance values on train tickets too often
    { "BAM", nullptr },
    // FKP, GIP and SHP are practically GBP-equivalent using the pound sign, SSP has it wrongly assigned in QLocale
    { "GBP", "£" },
    // the Yen sign is also used by CNY and thus ambiguous, but the Japanese Yen symbol works
    { "JPY", "円" },
};
0053 
0054 PriceFinder::PriceFinder()
0055 {
0056     if (!s_currencyData.empty()) {
0057         return;
0058     }
0059 
0060     const auto allLocales = QLocale::matchingLocales(QLocale::AnyLanguage, QLocale::AnyScript, QLocale::AnyCountry);
0061     for (const auto &locale : allLocales) {
0062         CurrencyData data{locale.currencySymbol(QLocale::CurrencyIsoCode), normalizeSymbol(locale.currencySymbol(QLocale::CurrencySymbol))};
0063         if (data.isoCode.isEmpty()) {
0064             continue;
0065         }
0066 
0067         // single letter symbols tend to be way too trigger-happy
0068         if (data.symbol.size() == 1 && data.symbol[0].isLetter()) {
0069             //qDebug() << "Dropping single letter symbol:" << data.symbol << data.isoCode;
0070             data.symbol.clear();
0071         }
0072 
0073         s_currencyData.push_back(std::move(data));
0074     }
0075 
0076     // remove duplicates
0077     const auto lessThanCurrencyData = [](const auto &lhs, const auto &rhs) {
0078         return std::tie(lhs.isoCode, lhs.symbol) < std::tie(rhs.isoCode, rhs.symbol);
0079     };
0080     std::sort(s_currencyData.begin(), s_currencyData.end(), lessThanCurrencyData);
0081     const auto compareCurrencyData = [](const auto &lhs, const auto &rhs) {
0082         return lhs.isoCode == rhs.isoCode && lhs.symbol == rhs.symbol;
0083     };
0084     s_currencyData.erase(std::unique(s_currencyData.begin(), s_currencyData.end(), compareCurrencyData), s_currencyData.end());
0085 
0086     // clear ambigious symbols
0087     for (auto it = s_currencyData.begin(); it != s_currencyData.end(); ++it) {
0088         if ((*it).symbol.isEmpty()) {
0089             continue;
0090         }
0091         bool collision = false;
0092         for (auto it2 = std::next(it); it2 != s_currencyData.end(); ++it2) {
0093             if (!isCollidingSymbol((*it).symbol, (*it2).symbol)) {
0094                 continue;
0095             }
0096             (*it2).symbol.clear();
0097             if (!collision) {
0098                 qDebug() << "Ambigious currency symbol:" << (*it).symbol;
0099             }
0100             collision = true;
0101         }
0102         if (collision) {
0103             (*it).symbol.clear();
0104         }
0105     }
0106 
0107     // apply our own overrides over QLocale
0108     for (auto it = s_currencyData.begin(); it != s_currencyData.end(); ++it) {
0109         const auto it2 = std::lower_bound(std::begin(currency_data_overrides), std::end(currency_data_overrides), (*it).isoCode, [](const auto &lhs, const auto &rhs) {
0110             return std::strncmp(lhs.isoCode, rhs.toLatin1().constData(), 3) < 0;
0111         });
0112         if (it2 == std::end(currency_data_overrides) || std::strncmp((*it2).isoCode, (*it).isoCode.toLatin1().constData(), 3) != 0) {
0113             continue;
0114         }
0115         (*it).symbol = (*it2).symbol ? QString::fromUtf8((*it2).symbol) : QString();
0116     }
0117 }
0118 
// Defaulted: the destructor does not touch the static currency table.
PriceFinder::~PriceFinder() = default;
0120 
0121 static bool isBoundaryChar(QChar c)
0122 {
0123     return c != QLatin1Char('-') && (c.isSpace() || c.isPunct() || c.isSymbol());
0124 }
0125 
0126 void PriceFinder::findAll(QStringView text, std::vector<Result> &results) const
0127 {
0128     static QRegularExpression rx(QStringLiteral(R"((?<=\s|[[:punct:]]|^)([^\d\s]{1,4})?[  ]*(\d(?:[\d,.  ]*\d)?)[  ]*([^\d\s]{1,4})?(?=\s|[[:punct:]]|$))"));
0129 
0130     const auto prevResultSize = results.size();
0131     qsizetype offset = 0;
0132     while (true) {
0133         const auto match = rx.matchView(text, offset);
0134         if (!match.hasMatch()) {
0135             break;
0136         }
0137         offset = match.capturedEnd(2);
0138 
0139         const auto leadingCurrency = parseCurrency(match.capturedView(1), CurrencyPrefix);
0140         const auto trailingCurrency = parseCurrency(match.capturedView(3), CurrencySuffix);
0141         if ((leadingCurrency.isEmpty() && trailingCurrency.isEmpty()) || (!leadingCurrency.isEmpty() && !trailingCurrency.isEmpty() && leadingCurrency != trailingCurrency)) {
0142             continue;
0143         }
0144 
0145         // additional boundary checks not covered by the regular expression
0146         if (leadingCurrency.isEmpty() && match.capturedStart(2) > 0 && !isBoundaryChar(text[match.capturedStart(2) - 1])) {
0147             continue;
0148         }
0149         if (trailingCurrency.isEmpty() && match.capturedEnd(2) < text.size() - 2 && !isBoundaryChar(text[match.capturedEnd(2)])) {
0150             continue;
0151         }
0152 
0153         Result r;
0154         r.start = leadingCurrency.isEmpty() ? match.capturedStart(2) : match.capturedStart();
0155         r.end = trailingCurrency.isEmpty() ? match.capturedEnd(2) : match.capturedEnd();
0156         r.currency = leadingCurrency.isEmpty() ? trailingCurrency : leadingCurrency;
0157 
0158         r.value = parseValue(match.capturedView(2), r.currency);
0159         if (std::isnan(r.value)) {
0160             continue;
0161         }
0162 
0163         results.push_back(std::move(r));
0164     }
0165 
0166     // check for overlapping results: in those case we have to assume the entire result is invalid
0167     if (results.size() <= 1 + prevResultSize) {
0168         return;
0169     }
0170     for (auto it = results.begin() + prevResultSize; it != std::prev(results.end()); ++it) {
0171         if ((*it).end >= (*std::next(it)).start) {
0172             qDebug() << "overlapping price data, discarding result";
0173             results.erase(results.begin() + prevResultSize, results.end());
0174             return;
0175         }
0176     }
0177 }
0178 
0179 PriceFinder::Result PriceFinder::findHighest(QStringView text) const
0180 {
0181     std::vector<Result> results;
0182     findAll(text, results);
0183     return highest(results);
0184 }
0185 
0186 bool PriceFinder::isSingleCurrency(const std::vector<Result> &results) const
0187 {
0188     if (results.empty()) {
0189         return false;
0190     }
0191 
0192     const auto isoCode = results.front().currency;
0193     return std::all_of(results.begin(), results.end(), [&isoCode](const auto &r) { return r.currency == isoCode; });
0194 }
0195 
0196 PriceFinder::Result PriceFinder::highest(const std::vector<Result> &results) const
0197 {
0198     if (!isSingleCurrency(results)) {
0199         return {};
0200     }
0201 
0202     const auto it = std::max_element(results.begin(), results.end(), [](const auto &lhs, const auto &rhs) { return lhs.value < rhs.value; });
0203     return (*it);
0204 }
0205 
0206 static bool equalIgnoreDiacritics(QStringView lhs, QStringView rhs)
0207 {
0208     if (lhs.size() != rhs.size()) {
0209         return false;
0210     }
0211 
0212     for (qsizetype i = 0; i < lhs.size(); ++i) {
0213         auto l = lhs[i];
0214         if (l.decompositionTag() == QChar::Canonical) {
0215             l = l.decomposition().at(0);
0216         }
0217         auto r = rhs[i];
0218         if (r.decompositionTag() == QChar::Canonical) {
0219             r = r.decomposition().at(0);
0220         }
0221         if (l != r) {
0222             return false;
0223         }
0224     }
0225 
0226     return true;
0227 }
0228 
0229 QString PriceFinder::parseCurrency(QStringView s, CurrencyPosition pos) const
0230 {
0231     // trim remaining boundary chars
0232     if (s.isEmpty()) {
0233         return {};
0234     }
0235 
0236     // valid currency ISO code
0237     auto isoCandidate = s;
0238     while (!isoCandidate.isEmpty() && isBoundaryChar(isoCandidate.last())) {
0239         isoCandidate = isoCandidate.left(isoCandidate.size() - 1);
0240     }
0241     while (!isoCandidate.isEmpty() && isBoundaryChar(isoCandidate.front())) {
0242         isoCandidate = isoCandidate.mid(1);
0243     }
0244     if (isoCandidate.size() == 3) {
0245         const auto it = std::lower_bound(s_currencyData.begin(), s_currencyData.end(), isoCandidate, [](const auto &lhs, QStringView rhs) { return lhs.isoCode < rhs; });
0246         if (it != s_currencyData.end() && (*it).isoCode == isoCandidate) {
0247             return (*it).isoCode;
0248         }
0249     }
0250 
0251     // currency symbol
0252     const auto symbol = normalizeSymbol(s);
0253     // exact match: we know there is only ever going to be one (see ctor)
0254     const auto it = std::find_if(s_currencyData.begin(), s_currencyData.end(), [&symbol](const auto &data) { return data.symbol == symbol; });
0255     if (it != s_currencyData.end())
0256         return (*it).isoCode;
0257 
0258     // partial match: needs to be unique
0259     QString isoCode;
0260     for (const auto &data : s_currencyData) {
0261         if (data.symbol.isEmpty()) {
0262             continue;
0263         }
0264 
0265         // match disregarding diacritics
0266         if (equalIgnoreDiacritics(data.symbol, symbol)) {
0267             if (!isoCode.isEmpty()) {
0268                 return {};
0269             }
0270             isoCode = data.isoCode;
0271         }
0272 
0273         // prefix or suffix match
0274         if (pos == CurrencyPrefix) {
0275             if (symbol.size() <= data.symbol.size() || !symbol.endsWith(data.symbol) || !isBoundaryChar(symbol.at(symbol.size() - data.symbol.size() - 1))) {
0276                 continue;
0277             }
0278         } else {
0279             if (symbol.size() <= data.symbol.size() || !symbol.startsWith(data.symbol) || !isBoundaryChar(symbol.at(data.symbol.size()))) {
0280                 continue;
0281             }
0282         }
0283         if (!isoCode.isEmpty()) {
0284             return {};
0285         }
0286         isoCode = data.isoCode;
0287     }
0288     return isoCode;
0289 }
0290 
0291 double PriceFinder::parseValue(QStringView s, const QString &isoCode) const
0292 {
0293     if (s.isEmpty() || !s[0].isDigit() || !s[s.size() - 1].isDigit()) {
0294         return NAN;
0295     }
0296 
0297     // find potential decimal separator
0298     QChar decimalSeparator;
0299     qsizetype decimalSeparatorIndex = -1;
0300     for (qsizetype i = s.size() - 1; i > 0; --i) {
0301         if (s[i].isDigit()) {
0302             continue;
0303         }
0304         if (!s[i].isSpace()) {
0305             decimalSeparator = s[i];
0306             decimalSeparatorIndex = i;
0307         }
0308         break;
0309     }
0310 
0311     // identify/validate group separators
0312     QChar groupSeparator;
0313     qsizetype lastGroupSeparatorIndex = -1;
0314     for (qsizetype i = 0; i < s.size(); ++i) {
0315         if (s[i].isDigit()) {
0316             continue;
0317         }
0318         if (lastGroupSeparatorIndex > 0 && i - lastGroupSeparatorIndex != 4) { // separator interval is wrong
0319             return NAN;
0320         }
0321         if (decimalSeparatorIndex > 0 && i == decimalSeparatorIndex) { // found the suspected decimal separator
0322             break;
0323         }
0324         if (!groupSeparator.isNull() && s[i] != groupSeparator) { // inconsistent separators
0325             return NAN;
0326         }
0327 
0328         lastGroupSeparatorIndex = i;
0329         groupSeparator = s[i];
0330     }
0331 
0332     // we found both and they are the same: has to be the group separator
0333     if (!decimalSeparator.isNull() && !groupSeparator.isNull() && decimalSeparator == groupSeparator) {
0334         if ((s.size() - decimalSeparatorIndex) != 4) {
0335             return NAN;
0336         }
0337         decimalSeparator = {};
0338         decimalSeparatorIndex = -1;
0339     }
0340 
0341     // we found a decimal separator: verify the number of decimals is consistent with the currency's subdivision
0342     // see https://en.wikipedia.org/wiki/List_of_circulating_currencies
0343     if (!decimalSeparator.isNull()) {
0344         const auto decimalCount = s.size() - decimalSeparatorIndex - 1;
0345         const auto expectedDecimalCount = PriceUtil::decimalCount(isoCode);
0346 
0347         // subdivision x1000 is ambigious if we don't have a group separator
0348         if (decimalCount == expectedDecimalCount && decimalCount == 3 && groupSeparator.isNull()) {
0349             return NAN;
0350         }
0351 
0352         // if decimal count is 3, assume group separator
0353         else if (decimalCount != expectedDecimalCount && decimalCount == 3) {
0354             if (groupSeparator.isNull()) {
0355                 groupSeparator = decimalSeparator;
0356                 decimalSeparator = {};
0357             } else {
0358                 return NAN;
0359             }
0360         }
0361 
0362         else if (decimalCount > expectedDecimalCount) {
0363             return NAN;
0364         }
0365     }
0366 
0367     // strip group separators, replace decimal separator
0368     auto normalized = s.toString();
0369     if (!groupSeparator.isNull()) {
0370         normalized.remove(groupSeparator);
0371     }
0372     if (!decimalSeparator.isNull()) {
0373         normalized.replace(decimalSeparator, QLatin1Char('.'));
0374     }
0375 
0376     bool ok = false;
0377     const auto value = normalized.toDouble(&ok);
0378     if (!ok) {
0379         return NAN;
0380     }
0381     return value;
0382 }