File indexing completed on 2024-05-12 11:48:04
0001 /* 0002 This file is part of the KDE libraries 0003 0004 SPDX-FileCopyrightText: 1999 Ian Zepp <icszepp@islc.net> 0005 SPDX-FileCopyrightText: 2006 Dominic Battre <dominic@battre.de> 0006 SPDX-FileCopyrightText: 2006 Martin Pool <mbp@canonical.com> 0007 0008 SPDX-License-Identifier: LGPL-2.0-or-later 0009 */ 0010 0011 #include "kstringhandler.h" 0012 0013 #include <stdlib.h> // random() 0014 0015 #if KCOREADDONS_BUILD_DEPRECATED_SINCE(5, 67) 0016 #include <QRegExp> // for the word ranges 0017 #endif 0018 #include <QRegularExpression> 0019 #include <QVector> 0020 0021 // 0022 // Capitalization routines 0023 // 0024 QString KStringHandler::capwords(const QString &text) 0025 { 0026 if (text.isEmpty()) { 0027 return text; 0028 } 0029 0030 const QString strippedText = text.trimmed(); 0031 const QString space = QString(QLatin1Char(' ')); 0032 const QStringList words = capwords(strippedText.split(space)); 0033 0034 QString result = text; 0035 result.replace(strippedText, words.join(space)); 0036 return result; 0037 } 0038 0039 QStringList KStringHandler::capwords(const QStringList &list) 0040 { 0041 QStringList tmp = list; 0042 for (auto &str : tmp) { 0043 str[0] = str.at(0).toUpper(); 0044 } 0045 return tmp; 0046 } 0047 0048 QString KStringHandler::lsqueeze(const QString &str, const int maxlen) 0049 { 0050 if (str.length() > maxlen) { 0051 const int part = maxlen - 3; 0052 return QLatin1String("...") + QStringView(str).right(part); 0053 } else { 0054 return str; 0055 } 0056 } 0057 0058 QString KStringHandler::csqueeze(const QString &str, const int maxlen) 0059 { 0060 if (str.length() > maxlen && maxlen > 3) { 0061 const int part = (maxlen - 3) / 2; 0062 const QStringView strView{str}; 0063 return strView.left(part) + QLatin1String("...") + strView.right(part); 0064 } else { 0065 return str; 0066 } 0067 } 0068 0069 QString KStringHandler::rsqueeze(const QString &str, const int maxlen) 0070 { 0071 if (str.length() > maxlen) { 0072 const int part = maxlen - 3; 0073 return QStringView(str).left(part) + QLatin1String("..."); 0074 } else { 0075 return str; 0076 } 0077 } 0078 0079 QStringList KStringHandler::perlSplit(const QStringView sep, const QStringView str, int max) 0080 { 0081 const bool ignoreMax = max == 0; 0082 0083 const int sepLength = sep.size(); 0084 0085 QStringList list; 0086 int searchStart = 0; 0087 int sepIndex = str.indexOf(sep, searchStart); 0088 0089 while (sepIndex != -1 && (ignoreMax || list.count() < max - 1)) { 0090 const auto chunk = str.mid(searchStart, sepIndex - searchStart); 0091 if (!chunk.isEmpty()) { 0092 list.append(chunk.toString()); 0093 } 0094 0095 searchStart = sepIndex + sepLength; 0096 sepIndex = str.indexOf(sep, searchStart); 0097 } 0098 0099 const auto lastChunk = str.mid(searchStart, str.length() - searchStart); 0100 if (!lastChunk.isEmpty()) { 0101 list.append(lastChunk.toString()); 0102 } 0103 0104 return list; 0105 } 0106 0107 QStringList KStringHandler::perlSplit(const QString &sep, const QString &s, int max) 0108 { 0109 return perlSplit(QStringView(sep), QStringView(s), max); 0110 } 0111 0112 QStringList KStringHandler::perlSplit(const QChar &sep, const QString &str, int max) 0113 { 0114 return perlSplit(QStringView(&sep, 1), QStringView(str), max); 0115 } 0116 0117 #if KCOREADDONS_BUILD_DEPRECATED_SINCE(5, 67) 0118 QStringList KStringHandler::perlSplit(const QRegExp &sep, const QString &s, const int max) 0119 { 0120 // nothing to split 0121 if (s.isEmpty()) { 0122 return QStringList(); 0123 } 0124 0125 const bool ignoreMax = 0 == max; 0126 0127 QStringList l; 0128 0129 int searchStart = 0; 0130 int tokenStart = sep.indexIn(s, searchStart); 0131 int len = sep.matchedLength(); 0132 0133 while (-1 != tokenStart && (ignoreMax || l.count() < max - 1)) { 0134 if (!s.midRef(searchStart, tokenStart - searchStart).isEmpty()) { 0135 l << s.mid(searchStart, tokenStart - searchStart); 0136 } 0137 0138 searchStart = tokenStart + len; 0139 tokenStart = sep.indexIn(s, searchStart); 0140 len = sep.matchedLength(); 0141 } 0142 0143 if (!s.midRef(searchStart, s.length() - searchStart).isEmpty()) { 0144 l << s.mid(searchStart, s.length() - searchStart); 0145 } 0146 0147 return l; 0148 } 0149 #endif 0150 0151 QStringList KStringHandler::perlSplit(const QRegularExpression &sep, const QString &str, int max) 0152 { 0153 // nothing to split 0154 if (str.isEmpty()) { 0155 return QStringList(); 0156 } 0157 0158 const bool ignoreMax = max == 0; 0159 0160 QStringList list; 0161 0162 int start = 0; 0163 0164 const QStringView strView(str); 0165 0166 QRegularExpression separator(sep); 0167 separator.setPatternOptions(QRegularExpression::UseUnicodePropertiesOption); 0168 0169 QRegularExpressionMatchIterator iter = separator.globalMatch(strView); 0170 QRegularExpressionMatch match; 0171 while (iter.hasNext() && (ignoreMax || list.count() < max - 1)) { 0172 match = iter.next(); 0173 const QStringView chunk = strView.mid(start, match.capturedStart() - start); 0174 if (!chunk.isEmpty()) { 0175 list.append(chunk.toString()); 0176 } 0177 0178 start = match.capturedEnd(); 0179 } 0180 0181 // catch the remainder 0182 const QStringView lastChunk = strView.mid(start, strView.size() - start); 0183 if (!lastChunk.isEmpty()) { 0184 list.append(lastChunk.toString()); 0185 } 0186 0187 return list; 0188 } 0189 0190 QString KStringHandler::tagUrls(const QString &text) 0191 { 0192 QString richText(text); 0193 0194 static const QRegularExpression urlEx(QStringLiteral(R"((www\.(?!\.)|(fish|ftp|http|https)://[\d\w./,:_~?=&;#@\-+%$()]+))"), 0195 QRegularExpression::UseUnicodePropertiesOption); 0196 // The reference \1 is going to be replaced by the matched url 0197 richText.replace(urlEx, QStringLiteral("<a href=\"\\1\">\\1</a>")); 0198 return richText; 0199 } 0200 0201 QString KStringHandler::obscure(const QString &str) 0202 { 0203 QString result; 0204 for (const QChar ch : str) { 0205 // yes, no typo. can't encode ' ' or '!' because 0206 // they're the unicode BOM. stupid scrambling. stupid. 0207 const ushort uc = ch.unicode(); 0208 result += (uc <= 0x21) ? ch : QChar(0x1001F - uc); 0209 } 0210 0211 return result; 0212 } 0213 0214 #if KCOREADDONS_BUILD_DEPRECATED_SINCE(5, 112) 0215 bool KStringHandler::isUtf8(const char *buf) 0216 { 0217 int i; 0218 int n; 0219 unsigned char c; 0220 bool gotone = false; 0221 0222 if (!buf) { 0223 return true; // whatever, just don't crash 0224 } 0225 0226 #define F 0 /* character never appears in text */ 0227 #define T 1 /* character appears in plain ASCII text */ 0228 #define I 2 /* character appears in ISO-8859 text */ 0229 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 0230 /* clang-format off */ 0231 static const unsigned char text_chars[256] = { 0232 /* BEL BS HT LF FF CR */ 0233 F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */ 0234 /* ESC */ 0235 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ 0236 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ 0237 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ 0238 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ 0239 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ 0240 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ 0241 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ 0242 /* NEL */ 0243 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */ 0244 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */ 0245 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */ 0246 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */ 0247 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */ 0248 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */ 0249 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */ 0250 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */ 0251 }; 0252 /* clang-format on */ 0253 0254 /* *ulen = 0; */ 0255 for (i = 0; (c = buf[i]); ++i) { 0256 if ((c & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ 0257 /* 0258 * Even if the whole file is valid UTF-8 sequences, 0259 * still reject it if it uses weird control characters. 0260 */ 0261 0262 if (text_chars[c] != T) { 0263 return false; 0264 } 0265 0266 } else if ((c & 0x40) == 0) { /* 10xxxxxx never 1st byte */ 0267 return false; 0268 } else { /* 11xxxxxx begins UTF-8 */ 0269 int following; 0270 0271 if ((c & 0x20) == 0) { /* 110xxxxx */ 0272 following = 1; 0273 } else if ((c & 0x10) == 0) { /* 1110xxxx */ 0274 following = 2; 0275 } else if ((c & 0x08) == 0) { /* 11110xxx */ 0276 following = 3; 0277 } else if ((c & 0x04) == 0) { /* 111110xx */ 0278 following = 4; 0279 } else if ((c & 0x02) == 0) { /* 1111110x */ 0280 following = 5; 0281 } else { 0282 return false; 0283 } 0284 0285 for (n = 0; n < following; ++n) { 0286 i++; 0287 if (!(c = buf[i])) { 0288 goto done; 0289 } 0290 0291 if ((c & 0x80) == 0 || (c & 0x40)) { 0292 return false; 0293 } 0294 } 0295 gotone = true; 0296 } 0297 } 0298 done: 0299 return gotone; /* don't claim it's UTF-8 if it's all 7-bit */ 0300 } 0301 0302 #undef F 0303 #undef T 0304 #undef I 0305 #undef X 0306 0307 #endif 0308 0309 #if KCOREADDONS_BUILD_DEPRECATED_SINCE(5, 112) 0310 QString KStringHandler::from8Bit(const char *str) 0311 { 0312 if (!str) { 0313 return QString(); 0314 } 0315 if (!*str) { 0316 static const QLatin1String emptyString(""); 0317 return emptyString; 0318 } 0319 return KStringHandler::isUtf8(str) ? QString::fromUtf8(str) : QString::fromLocal8Bit(str); 0320 } 0321 #endif 0322 0323 QString KStringHandler::preProcessWrap(const QString &text) 0324 { 0325 const QChar zwsp(0x200b); 0326 0327 QString result; 0328 result.reserve(text.length()); 0329 0330 for (int i = 0; i < text.length(); i++) { 0331 const QChar c = text[i]; 0332 const bool openingParens = (c == QLatin1Char('(') || c == QLatin1Char('{') || c == QLatin1Char('[')); 0333 const bool singleQuote = (c == QLatin1Char('\'')); 0334 const bool closingParens = (c == QLatin1Char(')') || c == QLatin1Char('}') || c == QLatin1Char(']')); 0335 const bool breakAfter = (closingParens || c.isPunct() || c.isSymbol()); 0336 const bool isLastChar = i == (text.length() - 1); 0337 const bool isLower = c.isLower(); 0338 const bool nextIsUpper = !isLastChar && text[i + 1].isUpper(); // false by default 0339 const bool nextIsSpace = isLastChar || text[i + 1].isSpace(); // true by default 0340 const bool prevIsSpace = (i == 0 || text[i - 1].isSpace() || result[result.length() - 1] == zwsp); 0341 0342 // Provide a breaking opportunity before opening parenthesis 0343 if (openingParens && !prevIsSpace) { 0344 result += zwsp; 0345 } 0346 0347 // Provide a word joiner before the single quote 0348 if (singleQuote && !prevIsSpace) { 0349 result += QChar(0x2060); 0350 } 0351 0352 result += c; 0353 0354 // Provide a breaking opportunity between camelCase and PascalCase sub-words 0355 const bool isCamelCase = isLower && nextIsUpper; 0356 0357 if (isCamelCase || (breakAfter && !openingParens && !nextIsSpace && !singleQuote)) { 0358 result += zwsp; 0359 } 0360 } 0361 0362 return result; 0363 } 0364 0365 int KStringHandler::logicalLength(const QString &text) 0366 { 0367 int length = 0; 0368 const auto chrs = text.toUcs4(); 0369 for (const auto chr : chrs) { 0370 const auto script = QChar::script(chr); 0371 /* clang-format off */ 0372 if (script == QChar::Script_Han 0373 || script == QChar::Script_Hangul 0374 || script == QChar::Script_Hiragana 0375 || script == QChar::Script_Katakana 0376 || script == QChar::Script_Yi 0377 || QChar::isHighSurrogate(chr)) { /* clang-format on */ 0378 length += 2; 0379 } else { 0380 length += 1; 0381 } 0382 } 0383 return length; 0384 }