// File indexing completed on 2024-04-28 15:34:19
0001 /* This file is part of the KDE libraries 0002 0003 SPDX-FileCopyrightText: 2004 Zack Rusin <zack@kde.org> 0004 SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> 0005 SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl> 0006 0007 SPDX-License-Identifier: LGPL-2.0-or-later 0008 */ 0009 0010 #include <QList> 0011 #include <QString> 0012 0013 #include "textbreaks_p.h" 0014 #include "tokenizer_p.h" 0015 0016 namespace Sonnet 0017 { 0018 class BreakTokenizerPrivate 0019 { 0020 public: 0021 enum Type { 0022 Words, 0023 Sentences, 0024 }; 0025 0026 BreakTokenizerPrivate(Type s) 0027 : breakFinder(new TextBreaks) 0028 , itemPosition(-1) 0029 , cacheValid(false) 0030 , type(s) 0031 { 0032 } 0033 0034 ~BreakTokenizerPrivate() 0035 { 0036 delete breakFinder; 0037 } 0038 0039 TextBreaks::Positions breaks() const; 0040 void invalidate(); 0041 void shiftBreaks(int from, int offset); 0042 void replace(int pos, int len, const QString &newWord); 0043 0044 TextBreaks *const breakFinder; 0045 QString buffer; 0046 0047 int itemPosition = -1; 0048 mutable bool cacheValid; 0049 Token last; 0050 const Type type; 0051 bool inAddress = false; 0052 bool ignoreUppercase = false; 0053 0054 bool hasNext() const; 0055 Token next(); 0056 void setBuffer(const QString &b) 0057 { 0058 invalidate(); 0059 buffer = b; 0060 } 0061 0062 private: 0063 void regenerateCache() const; 0064 mutable TextBreaks::Positions cachedBreaks; 0065 }; 0066 0067 void BreakTokenizerPrivate::invalidate() 0068 { 0069 cacheValid = false; 0070 itemPosition = -1; 0071 } 0072 0073 bool BreakTokenizerPrivate::hasNext() const 0074 { 0075 if (itemPosition >= (breaks().size() - 1)) { 0076 return false; 0077 } 0078 0079 return true; 0080 } 0081 0082 TextBreaks::Positions BreakTokenizerPrivate::breaks() const 0083 { 0084 if (!cacheValid) { 0085 regenerateCache(); 0086 } 0087 0088 return cachedBreaks; 0089 } 0090 0091 void BreakTokenizerPrivate::shiftBreaks(int from, int offset) 0092 { 0093 for (int i = 
0; i < cachedBreaks.size(); i++) { 0094 if (cachedBreaks[i].start > from) { 0095 cachedBreaks[i].start = cachedBreaks[i].start - offset; 0096 } 0097 } 0098 } 0099 0100 void BreakTokenizerPrivate::regenerateCache() const 0101 { 0102 if (!breakFinder || buffer.isEmpty()) { 0103 cachedBreaks = TextBreaks::Positions(); 0104 } 0105 0106 if (breakFinder) { 0107 breakFinder->setText(buffer); 0108 0109 if (type == Sentences) { 0110 cachedBreaks = breakFinder->sentenceBreaks(); 0111 } else if (type == Words) { 0112 cachedBreaks = breakFinder->wordBreaks(); 0113 } 0114 } 0115 0116 cacheValid = true; 0117 } 0118 0119 Token BreakTokenizerPrivate::next() 0120 { 0121 Token block; 0122 0123 if (!hasNext()) { 0124 last = block; 0125 return block; 0126 } 0127 0128 itemPosition++; 0129 0130 const TextBreaks::Position &textBreak = this->breaks().at(itemPosition); 0131 QStringView token = QStringView(buffer).mid(textBreak.start, textBreak.length); 0132 last = {token, textBreak.start}; 0133 return last; 0134 } 0135 0136 void BreakTokenizerPrivate::replace(int pos, int len, const QString &newWord) 0137 { 0138 buffer.replace(pos, len, newWord); 0139 int offset = len - newWord.length(); 0140 if (cacheValid) { 0141 shiftBreaks(pos, offset); 0142 } 0143 } 0144 0145 /*-----------------------------------------------------------*/ 0146 0147 WordTokenizer::WordTokenizer(const QString &buffer) 0148 : d(new BreakTokenizerPrivate(BreakTokenizerPrivate::Words)) 0149 { 0150 setBuffer(buffer); 0151 } 0152 0153 WordTokenizer::~WordTokenizer() 0154 { 0155 delete d; 0156 } 0157 0158 bool WordTokenizer::hasNext() const 0159 { 0160 return d->hasNext(); 0161 } 0162 0163 void WordTokenizer::setBuffer(const QString &buffer) 0164 { 0165 d->setBuffer(buffer); 0166 } 0167 0168 Token WordTokenizer::next() 0169 { 0170 Token n = d->next(); 0171 0172 // end of address of url? 
0173 if (d->inAddress && n.position() > 0 && d->buffer[n.position() - 1].isSpace()) { 0174 d->inAddress = false; 0175 } 0176 0177 // check if this word starts an email address of url 0178 if (!d->inAddress || hasNext()) { 0179 const int pos = n.position() + n.length(); 0180 if ((pos < d->buffer.length()) && d->buffer[pos] == QLatin1Char('@')) { 0181 d->inAddress = true; 0182 } 0183 if ((pos + 2 < d->buffer.length()) && d->buffer[pos] == QLatin1Char(':') && d->buffer[pos + 1] == QLatin1Char('/') 0184 && d->buffer[pos + 2] == QLatin1Char('/')) { 0185 d->inAddress = true; 0186 } 0187 } 0188 return n; 0189 } 0190 0191 QString WordTokenizer::buffer() const 0192 { 0193 return d->buffer; 0194 } 0195 0196 bool WordTokenizer::isUppercase(QStringView word) const 0197 { 0198 for (int i = 0; i < word.length(); ++i) { 0199 if (word.at(i).isLetter() && !word.at(i).isUpper()) { 0200 return false; 0201 } 0202 } 0203 return true; 0204 } 0205 0206 void WordTokenizer::setIgnoreUppercase(bool val) 0207 { 0208 d->ignoreUppercase = val; 0209 } 0210 0211 void WordTokenizer::replace(int pos, int len, const QString &newWord) 0212 { 0213 d->replace(pos, len, newWord); 0214 } 0215 0216 bool WordTokenizer::isSpellcheckable() const 0217 { 0218 if (d->last.isNull() || d->last.isEmpty()) { 0219 return false; 0220 } 0221 if (!d->last.at(0).isLetter()) { 0222 return false; 0223 } 0224 if (d->inAddress) { 0225 return false; 0226 } 0227 if (d->ignoreUppercase && isUppercase(d->last.token)) { 0228 return false; 0229 } 0230 return true; 0231 } 0232 0233 /* --------------------------------------------------------------------*/ 0234 0235 SentenceTokenizer::SentenceTokenizer(const QString &buffer) 0236 : d(new BreakTokenizerPrivate(BreakTokenizerPrivate::Sentences)) 0237 { 0238 setBuffer(buffer); 0239 } 0240 0241 SentenceTokenizer::~SentenceTokenizer() 0242 { 0243 delete d; 0244 } 0245 0246 bool SentenceTokenizer::hasNext() const 0247 { 0248 return d->hasNext(); 0249 } 0250 0251 void 
SentenceTokenizer::setBuffer(const QString &buffer) 0252 { 0253 d->setBuffer(buffer); 0254 } 0255 0256 Token SentenceTokenizer::next() 0257 { 0258 return d->next(); 0259 } 0260 0261 QString SentenceTokenizer::buffer() const 0262 { 0263 return d->buffer; 0264 } 0265 0266 void SentenceTokenizer::replace(int pos, int len, const QString &newWord) 0267 { 0268 d->replace(pos, len, newWord); 0269 } 0270 }