Warning, file /pim/kitinerary/src/lib/stringutil.cpp was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
0001 /* 0002 SPDX-FileCopyrightText: 2018 Volker Krause <vkrause@kde.org> 0003 0004 SPDX-License-Identifier: LGPL-2.0-or-later 0005 */ 0006 0007 #include "stringutil.h" 0008 0009 #include <KCharsets> 0010 0011 #include <QDebug> 0012 #include <QString> 0013 0014 #include <cstring> 0015 #include <cctype> 0016 0017 using namespace KItinerary; 0018 0019 QString StringUtil::normalize(QStringView str) 0020 { 0021 QString out; 0022 out.reserve(str.size()); 0023 for (const auto c : str) { 0024 // case folding 0025 const auto n = c.toCaseFolded(); 0026 0027 // if the character has a canonical decomposition use that and skip the 0028 // combining diacritic markers following it 0029 // see https://en.wikipedia.org/wiki/Unicode_equivalence 0030 // see https://en.wikipedia.org/wiki/Combining_character 0031 if (n.decompositionTag() == QChar::Canonical) { 0032 out.push_back(n.decomposition().at(0)); 0033 } 0034 // handle compatibility compositions such as ligatures 0035 // see https://en.wikipedia.org/wiki/Unicode_compatibility_characters 0036 else if (n.decompositionTag() == QChar::Compat && n.isLetter() && n.script() == QChar::Script_Latin) { 0037 out.append(n.decomposition()); 0038 } 0039 else { 0040 out.push_back(n); 0041 } 0042 } 0043 return out; 0044 } 0045 0046 static bool containsNonAscii(QStringView s) 0047 { 0048 for (const auto c : s) { 0049 if (c.row() != 0 || c.cell() > 127) { 0050 return true; 0051 } 0052 } 0053 0054 return false; 0055 } 0056 0057 static bool isMixedCase(QStringView s) 0058 { 0059 const auto letterCount = std::count_if(s.begin(), s.end(), [](auto c) { return c.isLetter(); }); 0060 const auto upperCount = std::count_if(s.begin(), s.end(), [](auto c) { return c.isUpper(); }); 0061 return upperCount != letterCount && upperCount != 0; 0062 } 0063 0064 static int longestUpperCaseSubstring(QStringView s) 0065 { 0066 int globalCount = 0; 0067 int count = 0; 0068 for (const auto c : s) { 0069 if (c.isUpper()) { 0070 ++count; 0071 continue; 0072 } 0073 globalCount = std::max(globalCount, count); 0074 count = 0; 0075 } 0076 return std::max(globalCount, count); 0077 } 0078 0079 QStringView StringUtil::betterString(QStringView lhs, QStringView rhs) 0080 { 0081 // prefer the one that exists at all 0082 if (lhs.isEmpty()) { 0083 return rhs; 0084 } 0085 if (rhs.isEmpty()) { 0086 return lhs; 0087 } 0088 0089 // prefer Unicode over ASCII normalization 0090 const auto lhsNonAscii = containsNonAscii(lhs); 0091 const auto rhsNonAscii = containsNonAscii(rhs); 0092 if (lhsNonAscii && !rhsNonAscii) { 0093 return lhs; 0094 } 0095 if (!lhsNonAscii && rhsNonAscii) { 0096 return rhs; 0097 } 0098 0099 // prefer better casing 0100 const auto lhsMixedCase = isMixedCase(lhs); 0101 const auto rhsMixedCase = isMixedCase(rhs); 0102 if (lhsMixedCase && !rhsMixedCase) { 0103 return lhs; 0104 } 0105 if (!lhsMixedCase && rhsMixedCase) { 0106 return rhs; 0107 } 0108 0109 if (lhs.size() == rhs.size()) { 0110 if (lhsMixedCase && rhsMixedCase) { 0111 if (longestUpperCaseSubstring(lhs) > longestUpperCaseSubstring(rhs)) { 0112 return rhs; 0113 } else if (longestUpperCaseSubstring(lhs) < longestUpperCaseSubstring(rhs)) { 0114 return lhs; 0115 } 0116 } 0117 if (!lhsMixedCase && !rhsMixedCase) { 0118 if (longestUpperCaseSubstring(lhs) > longestUpperCaseSubstring(rhs)) { 0119 return lhs; 0120 } 0121 else if (longestUpperCaseSubstring(lhs) < longestUpperCaseSubstring(rhs)) { 0122 return rhs; 0123 } 0124 } 0125 } 0126 0127 // prefer longer == more detailed version 0128 if (rhs.size() < lhs.size()) { 0129 return lhs; 0130 } 0131 return rhs; 0132 } 0133 0134 float StringUtil::prefixSimilarity(QStringView s1, QStringView s2) 0135 { 0136 if (s1.empty() || s2.empty()) { 0137 return 0.0f; 0138 } 0139 0140 if (s1.size() > s2.size()) { 0141 std::swap(s1, s2); 0142 } 0143 0144 for (int i = 0; i < s1.size(); ++i) { 0145 if (s1[i].toCaseFolded() == s2[i].toCaseFolded()) { 0146 continue; 0147 } 0148 return (float)i / (float)s2.size(); 0149 } 0150 0151 return (float)s1.size() / (float)s2.size(); 0152 } 0153 0154 QString StringUtil::clean(const QString &s) 0155 { 0156 return KCharsets::resolveEntities(s).simplified(); 0157 } 0158 0159 // keep this ordered (see https://en.wikipedia.org/wiki/List_of_Unicode_characters) 0160 struct { 0161 ushort key; 0162 const char* replacement; 0163 } static const transliteration_map[] = { 0164 { u'ä', "ae" }, 0165 { u'ö', "oe" }, 0166 { u'ø', "oe" }, 0167 { u'ü', "ue" }, 0168 { u'ő', "oe" }, 0169 }; 0170 0171 QString StringUtil::transliterate(QStringView s) 0172 { 0173 QString res; 0174 res.reserve(s.size()); 0175 0176 for (const auto c : s) { 0177 const auto it = std::lower_bound(std::begin(transliteration_map), std::end(transliteration_map), c, [](const auto &lhs, const auto rhs) { 0178 return QChar(lhs.key) < rhs; 0179 }); 0180 if (it != std::end(transliteration_map) && QChar((*it).key) == c) { 0181 res += QString::fromUtf8((*it).replacement); 0182 continue; 0183 } 0184 0185 if (c.decompositionTag() == QChar::Canonical) { // see above 0186 res += c.decomposition().at(0); 0187 } else { 0188 res += c; 0189 } 0190 } 0191 0192 return res; 0193 } 0194 0195 bool StringUtil::startsWithIgnoreSpace(const QByteArray &data, const char *pattern) 0196 { 0197 auto it = data.begin(); 0198 while (it != data.end() && std::isspace(static_cast<unsigned char>(*it))) { 0199 ++it; 0200 } 0201 0202 const auto len = std::strlen(pattern); 0203 if ((int)len >= std::distance(it, data.end())) { 0204 return false; 0205 } 0206 return std::strncmp(it, pattern, len) == 0; 0207 }