lib/text/kstringhandler.cpp

0001 /*
0002     This file is part of the KDE libraries
0003
0004     SPDX-FileCopyrightText: 1999 Ian Zepp <icszepp@islc.net>
0005     SPDX-FileCopyrightText: 2006 Dominic Battre <dominic@battre.de>
0006     SPDX-FileCopyrightText: 2006 Martin Pool <mbp@canonical.com>
0007
0008     SPDX-License-Identifier: LGPL-2.0-or-later
0009 */
0010
0011 #include "kstringhandler.h"
0012
0013 #include <stdlib.h> // random()
0014
0015 #if KCOREADDONS_BUILD_DEPRECATED_SINCE(5, 67)
0016 #include <QRegExp> // for the word ranges
0017 #endif
0018 #include <QRegularExpression>
0019 #include <QVector>
0020
0021 //
0022 // Capitalization routines
0023 //
0024 QString KStringHandler::capwords(const QString &text)
0025 {
0026     if (text.isEmpty()) {
0027         return text;
0028     }
0029
0030     const QString strippedText = text.trimmed();
0031     const QString space = QString(QLatin1Char(' '));
0032     const QStringList words = capwords(strippedText.split(space));
0033
0034     QString result = text;
0035     result.replace(strippedText, words.join(space));
0036     return result;
0037 }
0038
0039 QStringList KStringHandler::capwords(const QStringList &list)
0040 {
0041     QStringList tmp = list;
0042     for (auto &str : tmp) {
0043         str[0] = str.at(0).toUpper();
0044     }
0045     return tmp;
0046 }
0047
0048 QString KStringHandler::lsqueeze(const QString &str, const int maxlen)
0049 {
0050     if (str.length() > maxlen) {
0051         const int part = maxlen - 3;
0052         return QLatin1String("...") + QStringView(str).right(part);
0053     } else {
0054         return str;
0055     }
0056 }
0057
0058 QString KStringHandler::csqueeze(const QString &str, const int maxlen)
0059 {
0060     if (str.length() > maxlen && maxlen > 3) {
0061         const int part = (maxlen - 3) / 2;
0062         const QStringView strView{str};
0063         return strView.left(part) + QLatin1String("...") + strView.right(part);
0064     } else {
0065         return str;
0066     }
0067 }
0068
0069 QString KStringHandler::rsqueeze(const QString &str, const int maxlen)
0070 {
0071     if (str.length() > maxlen) {
0072         const int part = maxlen - 3;
0073         return QStringView(str).left(part) + QLatin1String("...");
0074     } else {
0075         return str;
0076     }
0077 }
0078
0079 QStringList KStringHandler::perlSplit(const QStringView sep, const QStringView str, int max)
0080 {
0081     const bool ignoreMax = max == 0;
0082
0083     const int sepLength = sep.size();
0084
0085     QStringList list;
0086     int searchStart = 0;
0087     int sepIndex = str.indexOf(sep, searchStart);
0088
0089     while (sepIndex != -1 && (ignoreMax || list.count() < max - 1)) {
0090         const auto chunk = str.mid(searchStart, sepIndex - searchStart);
0091         if (!chunk.isEmpty()) {
0092             list.append(chunk.toString());
0093         }
0094
0095         searchStart = sepIndex + sepLength;
0096         sepIndex = str.indexOf(sep, searchStart);
0097     }
0098
0099     const auto lastChunk = str.mid(searchStart, str.length() - searchStart);
0100     if (!lastChunk.isEmpty()) {
0101         list.append(lastChunk.toString());
0102     }
0103
0104     return list;
0105 }
0106
0107 QStringList KStringHandler::perlSplit(const QString &sep, const QString &s, int max)
0108 {
0109     return perlSplit(QStringView(sep), QStringView(s), max);
0110 }
0111
0112 QStringList KStringHandler::perlSplit(const QChar &sep, const QString &str, int max)
0113 {
0114     return perlSplit(QStringView(&sep, 1), QStringView(str), max);
0115 }
0116
0117 #if KCOREADDONS_BUILD_DEPRECATED_SINCE(5, 67)
0118 QStringList KStringHandler::perlSplit(const QRegExp &sep, const QString &s, const int max)
0119 {
0120     // nothing to split
0121     if (s.isEmpty()) {
0122         return QStringList();
0123     }
0124
0125     const bool ignoreMax = 0 == max;
0126
0127     QStringList l;
0128
0129     int searchStart = 0;
0130     int tokenStart = sep.indexIn(s, searchStart);
0131     int len = sep.matchedLength();
0132
0133     while (-1 != tokenStart && (ignoreMax || l.count() < max - 1)) {
0134         if (!s.midRef(searchStart, tokenStart - searchStart).isEmpty()) {
0135             l << s.mid(searchStart, tokenStart - searchStart);
0136         }
0137
0138         searchStart = tokenStart + len;
0139         tokenStart = sep.indexIn(s, searchStart);
0140         len = sep.matchedLength();
0141     }
0142
0143     if (!s.midRef(searchStart, s.length() - searchStart).isEmpty()) {
0144         l << s.mid(searchStart, s.length() - searchStart);
0145     }
0146
0147     return l;
0148 }
0149 #endif
0150
0151 QStringList KStringHandler::perlSplit(const QRegularExpression &sep, const QString &str, int max)
0152 {
0153     // nothing to split
0154     if (str.isEmpty()) {
0155         return QStringList();
0156     }
0157
0158     const bool ignoreMax = max == 0;
0159
0160     QStringList list;
0161
0162     int start = 0;
0163
0164     const QStringView strView(str);
0165
0166     QRegularExpression separator(sep);
0167     separator.setPatternOptions(QRegularExpression::UseUnicodePropertiesOption);
0168
0169     QRegularExpressionMatchIterator iter = separator.globalMatch(strView);
0170     QRegularExpressionMatch match;
0171     while (iter.hasNext() && (ignoreMax || list.count() < max - 1)) {
0172         match = iter.next();
0173         const QStringView chunk = strView.mid(start, match.capturedStart() - start);
0174         if (!chunk.isEmpty()) {
0175             list.append(chunk.toString());
0176         }
0177
0178         start = match.capturedEnd();
0179     }
0180
0181     // catch the remainder
0182     const QStringView lastChunk = strView.mid(start, strView.size() - start);
0183     if (!lastChunk.isEmpty()) {
0184         list.append(lastChunk.toString());
0185     }
0186
0187     return list;
0188 }
0189
0190 QString KStringHandler::tagUrls(const QString &text)
0191 {
0192     QString richText(text);
0193
0194     static const QRegularExpression urlEx(QStringLiteral(R"((www\.(?!\.)|(fish|ftp|http|https)://[\d\w./,:_~?=&;#@\-+%$()]+))"),
0195                                           QRegularExpression::UseUnicodePropertiesOption);
0196     // The reference \1 is going to be replaced by the matched url
0197     richText.replace(urlEx, QStringLiteral("<a href=\"\\1\">\\1</a>"));
0198     return richText;
0199 }
0200
0201 QString KStringHandler::obscure(const QString &str)
0202 {
0203     QString result;
0204     for (const QChar ch : str) {
0205         // yes, no typo. can't encode ' ' or '!' because
0206         // they're the unicode BOM. stupid scrambling. stupid.
0207         const ushort uc = ch.unicode();
0208         result += (uc <= 0x21) ? ch : QChar(0x1001F - uc);
0209     }
0210
0211     return result;
0212 }
0213
0214 #if KCOREADDONS_BUILD_DEPRECATED_SINCE(5, 112)
0215 bool KStringHandler::isUtf8(const char *buf)
0216 {
0217     int i;
0218     int n;
0219     unsigned char c;
0220     bool gotone = false;
0221
0222     if (!buf) {
0223         return true; // whatever, just don't crash
0224     }
0225
0226 #define F 0 /* character never appears in text */
0227 #define T 1 /* character appears in plain ASCII text */
0228 #define I 2 /* character appears in ISO-8859 text */
0229 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
0230     /* clang-format off */
0231     static const unsigned char text_chars[256] = {
0232         /*                  BEL BS HT LF    FF CR    */
0233         F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */
0234         /*                              ESC          */
0235         F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
0236         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */
0237         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */
0238         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */
0239         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */
0240         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */
0241         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */
0242         /*            NEL                            */
0243         X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */
0244         X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */
0245         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */
0246         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */
0247         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */
0248         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */
0249         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */
0250         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */
0251     };
0252     /* clang-format on */
0253
0254     /* *ulen = 0; */
0255     for (i = 0; (c = buf[i]); ++i) {
0256         if ((c & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
0257             /*
0258              * Even if the whole file is valid UTF-8 sequences,
0259              * still reject it if it uses weird control characters.
0260              */
0261
0262             if (text_chars[c] != T) {
0263                 return false;
0264             }
0265
0266         } else if ((c & 0x40) == 0) { /* 10xxxxxx never 1st byte */
0267             return false;
0268         } else { /* 11xxxxxx begins UTF-8 */
0269             int following;
0270
0271             if ((c & 0x20) == 0) { /* 110xxxxx */
0272                 following = 1;
0273             } else if ((c & 0x10) == 0) { /* 1110xxxx */
0274                 following = 2;
0275             } else if ((c & 0x08) == 0) { /* 11110xxx */
0276                 following = 3;
0277             } else if ((c & 0x04) == 0) { /* 111110xx */
0278                 following = 4;
0279             } else if ((c & 0x02) == 0) { /* 1111110x */
0280                 following = 5;
0281             } else {
0282                 return false;
0283             }
0284
0285             for (n = 0; n < following; ++n) {
0286                 i++;
0287                 if (!(c = buf[i])) {
0288                     goto done;
0289                 }
0290
0291                 if ((c & 0x80) == 0 || (c & 0x40)) {
0292                     return false;
0293                 }
0294             }
0295             gotone = true;
0296         }
0297     }
0298 done:
0299     return gotone; /* don't claim it's UTF-8 if it's all 7-bit */
0300 }
0301
0302 #undef F
0303 #undef T
0304 #undef I
0305 #undef X
0306
0307 #endif
0308
0309 #if KCOREADDONS_BUILD_DEPRECATED_SINCE(5, 112)
0310 QString KStringHandler::from8Bit(const char *str)
0311 {
0312     if (!str) {
0313         return QString();
0314     }
0315     if (!*str) {
0316         static const QLatin1String emptyString("");
0317         return emptyString;
0318     }
0319     return KStringHandler::isUtf8(str) ? QString::fromUtf8(str) : QString::fromLocal8Bit(str);
0320 }
0321 #endif
0322
0323 QString KStringHandler::preProcessWrap(const QString &text)
0324 {
0325     const QChar zwsp(0x200b);
0326
0327     QString result;
0328     result.reserve(text.length());
0329
0330     for (int i = 0; i < text.length(); i++) {
0331         const QChar c = text[i];
0332         const bool openingParens = (c == QLatin1Char('(') || c == QLatin1Char('{') || c == QLatin1Char('['));
0333         const bool singleQuote = (c == QLatin1Char('\''));
0334         const bool closingParens = (c == QLatin1Char(')') || c == QLatin1Char('}') || c == QLatin1Char(']'));
0335         const bool breakAfter = (closingParens || c.isPunct() || c.isSymbol());
0336         const bool isLastChar = i == (text.length() - 1);
0337         const bool isLower = c.isLower();
0338         const bool nextIsUpper = !isLastChar && text[i + 1].isUpper(); // false by default
0339         const bool nextIsSpace = isLastChar || text[i + 1].isSpace(); // true by default
0340         const bool prevIsSpace = (i == 0 || text[i - 1].isSpace() || result[result.length() - 1] == zwsp);
0341
0342         // Provide a breaking opportunity before opening parenthesis
0343         if (openingParens && !prevIsSpace) {
0344             result += zwsp;
0345         }
0346
0347         // Provide a word joiner before the single quote
0348         if (singleQuote && !prevIsSpace) {
0349             result += QChar(0x2060);
0350         }
0351
0352         result += c;
0353
0354         // Provide a breaking opportunity between camelCase and PascalCase sub-words
0355         const bool isCamelCase = isLower && nextIsUpper;
0356
0357         if (isCamelCase || (breakAfter && !openingParens && !nextIsSpace && !singleQuote)) {
0358             result += zwsp;
0359         }
0360     }
0361
0362     return result;
0363 }
0364
0365 int KStringHandler::logicalLength(const QString &text)
0366 {
0367     int length = 0;
0368     const auto chrs = text.toUcs4();
0369     for (const auto chr : chrs) {
0370         const auto script = QChar::script(chr);
0371         /* clang-format off */
0372         if (script == QChar::Script_Han
0373             || script == QChar::Script_Hangul
0374             || script == QChar::Script_Hiragana
0375             || script == QChar::Script_Katakana
0376             || script == QChar::Script_Yi
0377             || QChar::isHighSurrogate(chr)) { /* clang-format on */
0378             length += 2;
0379         } else {
0380             length += 1;
0381         }
0382     }
0383     return length;
0384 }