File indexing completed on 2024-04-21 04:00:55
0001 /* This file is part of the KDE libraries 0002 0003 SPDX-FileCopyrightText: 2004 Zack Rusin <zack@kde.org> 0004 SPDX-FileCopyrightText: 2006 Jacob R Rideout <kde@jacobrideout.net> 0005 SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl> 0006 0007 SPDX-License-Identifier: LGPL-2.0-or-later 0008 */ 0009 0010 #include <QList> 0011 #include <QString> 0012 0013 #include "textbreaks_p.h" 0014 #include "tokenizer_p.h" 0015 0016 namespace Sonnet 0017 { 0018 class BreakTokenizerPrivate 0019 { 0020 public: 0021 enum Type { 0022 Words, 0023 Sentences, 0024 }; 0025 0026 BreakTokenizerPrivate(Type s) 0027 : breakFinder(new TextBreaks) 0028 , itemPosition(-1) 0029 , cacheValid(false) 0030 , type(s) 0031 { 0032 } 0033 0034 ~BreakTokenizerPrivate() 0035 { 0036 delete breakFinder; 0037 } 0038 0039 TextBreaks::Positions breaks() const; 0040 void invalidate(); 0041 void shiftBreaks(int from, int offset); 0042 void replace(int pos, int len, const QString &newWord); 0043 0044 TextBreaks *const breakFinder; 0045 QString buffer; 0046 0047 int itemPosition = -1; 0048 mutable bool cacheValid; 0049 Token last; 0050 const Type type; 0051 bool inAddress = false; 0052 bool ignoreUppercase = false; 0053 0054 bool hasNext() const; 0055 Token next(); 0056 void setBuffer(const QString &b) 0057 { 0058 invalidate(); 0059 buffer = b; 0060 } 0061 0062 private: 0063 void regenerateCache() const; 0064 mutable TextBreaks::Positions cachedBreaks; 0065 }; 0066 0067 void BreakTokenizerPrivate::invalidate() 0068 { 0069 cacheValid = false; 0070 itemPosition = -1; 0071 } 0072 0073 bool BreakTokenizerPrivate::hasNext() const 0074 { 0075 if (itemPosition >= (breaks().size() - 1)) { 0076 return false; 0077 } 0078 0079 return true; 0080 } 0081 0082 TextBreaks::Positions BreakTokenizerPrivate::breaks() const 0083 { 0084 if (!cacheValid) { 0085 regenerateCache(); 0086 } 0087 0088 return cachedBreaks; 0089 } 0090 0091 void BreakTokenizerPrivate::shiftBreaks(int from, int offset) 0092 { 0093 for (int i = 0; i < cachedBreaks.size(); i++) { 0094 if (cachedBreaks[i].start > from) { 0095 cachedBreaks[i].start = cachedBreaks[i].start - offset; 0096 } 0097 } 0098 } 0099 0100 void BreakTokenizerPrivate::regenerateCache() const 0101 { 0102 if (!breakFinder || buffer.isEmpty()) { 0103 cachedBreaks = TextBreaks::Positions(); 0104 } 0105 0106 if (breakFinder) { 0107 breakFinder->setText(buffer); 0108 0109 if (type == Sentences) { 0110 cachedBreaks = breakFinder->sentenceBreaks(); 0111 } else if (type == Words) { 0112 cachedBreaks = breakFinder->wordBreaks(); 0113 } 0114 } 0115 0116 cacheValid = true; 0117 } 0118 0119 Token BreakTokenizerPrivate::next() 0120 { 0121 Token block; 0122 0123 if (!hasNext()) { 0124 last = block; 0125 return block; 0126 } 0127 0128 itemPosition++; 0129 0130 const TextBreaks::Positions breaks = this->breaks(); 0131 const TextBreaks::Position &textBreak = breaks.at(itemPosition); 0132 QStringView token = QStringView(buffer).mid(textBreak.start, textBreak.length); 0133 last = {token, textBreak.start}; 0134 return last; 0135 } 0136 0137 void BreakTokenizerPrivate::replace(int pos, int len, const QString &newWord) 0138 { 0139 buffer.replace(pos, len, newWord); 0140 int offset = len - newWord.length(); 0141 if (cacheValid) { 0142 shiftBreaks(pos, offset); 0143 } 0144 } 0145 0146 /*-----------------------------------------------------------*/ 0147 0148 WordTokenizer::WordTokenizer(const QString &buffer) 0149 : d(new BreakTokenizerPrivate(BreakTokenizerPrivate::Words)) 0150 { 0151 setBuffer(buffer); 0152 } 0153 0154 WordTokenizer::~WordTokenizer() = default; 0155 0156 bool WordTokenizer::hasNext() const 0157 { 0158 return d->hasNext(); 0159 } 0160 0161 void WordTokenizer::setBuffer(const QString &buffer) 0162 { 0163 d->setBuffer(buffer); 0164 } 0165 0166 Token WordTokenizer::next() 0167 { 0168 Token n = d->next(); 0169 0170 // end of address of url? 0171 if (d->inAddress && n.position() > 0 && d->buffer[n.position() - 1].isSpace()) { 0172 d->inAddress = false; 0173 } 0174 0175 // check if this word starts an email address of url 0176 if (!d->inAddress || hasNext()) { 0177 const int pos = n.position() + n.length(); 0178 if ((pos < d->buffer.length()) && d->buffer[pos] == QLatin1Char('@')) { 0179 d->inAddress = true; 0180 } 0181 if ((pos + 2 < d->buffer.length()) && d->buffer[pos] == QLatin1Char(':') && d->buffer[pos + 1] == QLatin1Char('/') 0182 && d->buffer[pos + 2] == QLatin1Char('/')) { 0183 d->inAddress = true; 0184 } 0185 } 0186 return n; 0187 } 0188 0189 QString WordTokenizer::buffer() const 0190 { 0191 return d->buffer; 0192 } 0193 0194 bool WordTokenizer::isUppercase(QStringView word) const 0195 { 0196 for (int i = 0; i < word.length(); ++i) { 0197 if (word.at(i).isLetter() && !word.at(i).isUpper()) { 0198 return false; 0199 } 0200 } 0201 return true; 0202 } 0203 0204 void WordTokenizer::setIgnoreUppercase(bool val) 0205 { 0206 d->ignoreUppercase = val; 0207 } 0208 0209 void WordTokenizer::replace(int pos, int len, const QString &newWord) 0210 { 0211 d->replace(pos, len, newWord); 0212 } 0213 0214 bool WordTokenizer::isSpellcheckable() const 0215 { 0216 if (d->last.isNull() || d->last.isEmpty()) { 0217 return false; 0218 } 0219 if (!d->last.at(0).isLetter()) { 0220 return false; 0221 } 0222 if (d->inAddress) { 0223 return false; 0224 } 0225 if (d->ignoreUppercase && isUppercase(d->last.token)) { 0226 return false; 0227 } 0228 return true; 0229 } 0230 0231 /* --------------------------------------------------------------------*/ 0232 0233 SentenceTokenizer::SentenceTokenizer(const QString &buffer) 0234 : d(new BreakTokenizerPrivate(BreakTokenizerPrivate::Sentences)) 0235 { 0236 setBuffer(buffer); 0237 } 0238 0239 SentenceTokenizer::~SentenceTokenizer() = default; 0240 0241 bool SentenceTokenizer::hasNext() const 0242 { 0243 return d->hasNext(); 0244 } 0245 0246 void SentenceTokenizer::setBuffer(const QString &buffer) 0247 { 0248 d->setBuffer(buffer); 0249 } 0250 0251 Token SentenceTokenizer::next() 0252 { 0253 return d->next(); 0254 } 0255 0256 QString SentenceTokenizer::buffer() const 0257 { 0258 return d->buffer; 0259 } 0260 0261 void SentenceTokenizer::replace(int pos, int len, const QString &newWord) 0262 { 0263 d->replace(pos, len, newWord); 0264 } 0265 }