src/lib/stringutil.cpp

0001 /*
0002     SPDX-FileCopyrightText: 2018 Volker Krause <vkrause@kde.org>
0003
0004     SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006
0007 #include "stringutil.h"
0008
0009 #include <KCharsets>
0010
0011 #include <QDebug>
0012 #include <QString>
0013
0014 #include <cstring>
0015 #include <cctype>
0016
0017 using namespace KItinerary;
0018
0019 QString StringUtil::normalize(QStringView str)
0020 {
0021     QString out;
0022     out.reserve(str.size());
0023     for (const auto c : str) {
0024         // case folding
0025         const auto n = c.toCaseFolded();
0026
0027         // if the character has a canonical decomposition use that and skip the
0028         // combining diacritic markers following it
0029         // see https://en.wikipedia.org/wiki/Unicode_equivalence
0030         // see https://en.wikipedia.org/wiki/Combining_character
0031         if (n.decompositionTag() == QChar::Canonical) {
0032             out.push_back(n.decomposition().at(0));
0033         }
0034         // handle compatibility compositions such as ligatures
0035         // see https://en.wikipedia.org/wiki/Unicode_compatibility_characters
0036         else if (n.decompositionTag() == QChar::Compat && n.isLetter() && n.script() == QChar::Script_Latin) {
0037             out.append(n.decomposition());
0038         }
0039         else {
0040             out.push_back(n);
0041         }
0042     }
0043     return out;
0044 }
0045
0046 static bool containsNonAscii(QStringView s)
0047 {
0048     for (const auto c : s) {
0049         if (c.row() != 0 || c.cell() > 127) {
0050             return true;
0051         }
0052     }
0053
0054     return false;
0055 }
0056
0057 static bool isMixedCase(QStringView s)
0058 {
0059     const auto letterCount = std::count_if(s.begin(), s.end(), [](auto c) { return c.isLetter(); });
0060     const auto upperCount = std::count_if(s.begin(), s.end(), [](auto c) { return c.isUpper(); });
0061     return upperCount != letterCount && upperCount != 0;
0062 }
0063
0064 static int longestUpperCaseSubstring(QStringView s)
0065 {
0066     int globalCount = 0;
0067     int count = 0;
0068     for (const auto c : s) {
0069         if (c.isUpper()) {
0070             ++count;
0071             continue;
0072         }
0073         globalCount = std::max(globalCount, count);
0074         count = 0;
0075     }
0076     return std::max(globalCount, count);
0077 }
0078
0079 QStringView StringUtil::betterString(QStringView lhs, QStringView rhs)
0080 {
0081     // prefer the one that exists at all
0082     if (lhs.isEmpty()) {
0083         return rhs;
0084     }
0085     if (rhs.isEmpty()) {
0086         return lhs;
0087     }
0088
0089     // prefer Unicode over ASCII normalization
0090     const auto lhsNonAscii = containsNonAscii(lhs);
0091     const auto rhsNonAscii = containsNonAscii(rhs);
0092     if (lhsNonAscii && !rhsNonAscii) {
0093         return lhs;
0094     }
0095     if (!lhsNonAscii && rhsNonAscii) {
0096         return rhs;
0097     }
0098
0099     // prefer better casing
0100     const auto lhsMixedCase = isMixedCase(lhs);
0101     const auto rhsMixedCase = isMixedCase(rhs);
0102     if (lhsMixedCase && !rhsMixedCase) {
0103         return lhs;
0104     }
0105     if (!lhsMixedCase && rhsMixedCase) {
0106         return rhs;
0107     }
0108
0109     if (lhs.size() == rhs.size()) {
0110         if (lhsMixedCase && rhsMixedCase) {
0111             if (longestUpperCaseSubstring(lhs) > longestUpperCaseSubstring(rhs)) {
0112                 return rhs;
0113             } else if (longestUpperCaseSubstring(lhs) < longestUpperCaseSubstring(rhs)) {
0114                 return lhs;
0115             }
0116         }
0117         if (!lhsMixedCase && !rhsMixedCase) {
0118             if (longestUpperCaseSubstring(lhs) > longestUpperCaseSubstring(rhs)) {
0119                 return lhs;
0120             }
0121             else if (longestUpperCaseSubstring(lhs) < longestUpperCaseSubstring(rhs)) {
0122                 return rhs;
0123             }
0124         }
0125     }
0126
0127     // prefer longer == more detailed version
0128     if (rhs.size() < lhs.size()) {
0129         return lhs;
0130     }
0131     return rhs;
0132 }
0133
0134 float StringUtil::prefixSimilarity(QStringView s1, QStringView s2)
0135 {
0136     if (s1.empty() || s2.empty()) {
0137         return 0.0f;
0138     }
0139
0140     if (s1.size() > s2.size()) {
0141         std::swap(s1, s2);
0142     }
0143
0144     for (int i = 0; i < s1.size(); ++i) {
0145         if (s1[i].toCaseFolded() == s2[i].toCaseFolded()) {
0146             continue;
0147         }
0148         return (float)i / (float)s2.size();
0149     }
0150
0151     return (float)s1.size() / (float)s2.size();
0152 }
0153
0154 QString StringUtil::clean(const QString &s)
0155 {
0156     return KCharsets::resolveEntities(s).simplified();
0157 }
0158
// Characters with a dedicated multi-letter ASCII replacement.
// Keep this sorted by ascending key, it is searched with std::lower_bound
// (see https://en.wikipedia.org/wiki/List_of_Unicode_characters)
static const struct {
    ushort key;
    const char *replacement;
} transliteration_map[] = {
    { u'\u00e4', "ae" }, // ä
    { u'\u00f6', "oe" }, // ö
    { u'\u00f8', "oe" }, // ø
    { u'\u00fc', "ue" }, // ü
    { u'\u0151', "oe" }, // ő
};
0170
0171 QString StringUtil::transliterate(QStringView s)
0172 {
0173     QString res;
0174     res.reserve(s.size());
0175
0176     for (const auto c : s) {
0177         const auto it = std::lower_bound(std::begin(transliteration_map), std::end(transliteration_map), c, [](const auto &lhs, const auto rhs) {
0178             return QChar(lhs.key) < rhs;
0179         });
0180         if (it != std::end(transliteration_map) && QChar((*it).key) == c) {
0181             res += QString::fromUtf8((*it).replacement);
0182             continue;
0183         }
0184
0185         if (c.decompositionTag() == QChar::Canonical) { // see above
0186             res += c.decomposition().at(0);
0187         } else {
0188             res += c;
0189         }
0190     }
0191
0192     return res;
0193 }
0194
0195 bool StringUtil::startsWithIgnoreSpace(const QByteArray &data, const char *pattern)
0196 {
0197     auto it = data.begin();
0198     while (it != data.end() && std::isspace(static_cast<unsigned char>(*it))) {
0199         ++it;
0200     }
0201
0202     const auto len = std::strlen(pattern);
0203     if ((int)len >= std::distance(it, data.end())) {
0204         return false;
0205     }
0206     return std::strncmp(it, pattern, len) == 0;
0207 }