File indexing completed on 2024-04-28 15:34:19
0001 /* This file is part of the KDE libraries 0002 0003 SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl> 0004 0005 SPDX-License-Identifier: LGPL-2.0-or-later 0006 */ 0007 0008 #ifndef ABSTRACTTOKENIZER_H 0009 #define ABSTRACTTOKENIZER_H 0010 0011 #include "sonnetcore_export.h" 0012 #include <QString> 0013 0014 namespace Sonnet 0015 { 0016 struct Token { 0017 QStringView token = nullptr; 0018 int positionInBuffer = -1; 0019 0020 QString toString() const 0021 { 0022 return token.toString(); 0023 } 0024 0025 /** 0026 * @brief length of this token 0027 */ 0028 Q_DECL_CONSTEXPR int length() const 0029 { 0030 return token.size(); 0031 } 0032 0033 /** 0034 * @brief position in buffer of which the @ref token is a view 0035 */ 0036 Q_DECL_CONSTEXPR int position() const 0037 { 0038 return positionInBuffer; 0039 } 0040 0041 Q_DECL_CONSTEXPR bool isNull() const 0042 { 0043 return token.isNull(); 0044 } 0045 0046 Q_DECL_CONSTEXPR bool isEmpty() const 0047 { 0048 return token.isEmpty(); 0049 } 0050 0051 Q_DECL_CONSTEXPR QChar at(qsizetype n) const 0052 { 0053 return token.at(n); 0054 } 0055 }; 0056 0057 /** 0058 * @short AbstractTokenizer breaks text into smaller pieces - words, sentences, paragraphs. 0059 * 0060 * AbstractTokenizer is an abstract class that must be subclassed to be used. It provides API modelled 0061 * after Java-style iterators. During tokenization buffer can be modified using provided replace() method. 0062 * 0063 * @since 4.3 0064 */ 0065 class AbstractTokenizer 0066 { 0067 public: 0068 virtual ~AbstractTokenizer() 0069 { 0070 } 0071 0072 /** 0073 * Sets text to tokenize. It also resets tokenizer state. 0074 */ 0075 virtual void setBuffer(const QString &buffer = QString()) = 0; 0076 /** 0077 * Returns true if there is another token available. 0078 * @return true if another token is available, false if not. 0079 */ 0080 virtual bool hasNext() const = 0; 0081 0082 /** 0083 * Returns next token or null QString if there is none 0084 */ 0085 virtual Token next() = 0; 0086 0087 /** Returns content of currently tokenized buffer*/ 0088 virtual QString buffer() const = 0; 0089 0090 /** 0091 * Replace part of text in current buffer. Always use this function instead of directly 0092 * changing data in underlying buffer or tokenizer's internal state may become inconsistent. 0093 */ 0094 virtual void replace(int position, int len, const QString &newWord) = 0; 0095 }; 0096 0097 class BreakTokenizerPrivate; 0098 0099 /** 0100 @short WordTokenizer splits supplied buffer into individual words. 0101 0102 WordTokenizer splits buffer into words according to rules from Unicode standard 5.1. 0103 If purpose is to check spelling, use isSpellcheckable() to determine if current word should be 0104 checked or ignored. 0105 0106 Usage example: 0107 0108 @code 0109 WordTokenizer t(buffer); 0110 Speller sp; 0111 while (t.hasNext()) { 0112 Token word=t.next(); 0113 if (!t.isSpellcheckable()) continue; 0114 qDebug() << word.toString() << " " << sp.isCorrect(word.toString()); 0115 } 0116 @endcode 0117 0118 This example checks spelling of given buffer 0119 * @since 4.3 0120 */ 0121 class SONNETCORE_EXPORT WordTokenizer : public AbstractTokenizer 0122 { 0123 public: 0124 /** 0125 * Constructor for word tokenizer 0126 * @param buffer 0127 */ 0128 WordTokenizer(const QString &buffer = QString()); 0129 ~WordTokenizer() override; 0130 0131 void setBuffer(const QString &buffer) override; 0132 bool hasNext() const override; 0133 Token next() override; 0134 QString buffer() const override; 0135 void replace(int position, int len, const QString &newWord) override; 0136 0137 /** Returns true if this word should be spell checked. This ignores email addresses, URLs and other things according to configuration */ 0138 bool isSpellcheckable() const; 0139 0140 /** If ignore uppercase is true, then any word containing only uppercase letters will be considered unsuitable for spell check */ 0141 void setIgnoreUppercase(bool val); 0142 0143 private: 0144 SONNETCORE_NO_EXPORT bool isUppercase(QStringView word) const; 0145 BreakTokenizerPrivate *const d; 0146 }; 0147 0148 /** 0149 @short SentenceTokenizer splits supplied buffer into individual sentences. 0150 0151 SentenceTokenizer splits buffer into sentences according to rules from Unicode standard 5.1. 0152 * @since 4.3 0153 */ 0154 class SONNETCORE_EXPORT SentenceTokenizer : public AbstractTokenizer 0155 { 0156 public: 0157 SentenceTokenizer(const QString &buffer = QString()); 0158 ~SentenceTokenizer() override; 0159 void setBuffer(const QString &buffer) override; 0160 bool hasNext() const override; 0161 Token next() override; 0162 QString buffer() const override; 0163 void replace(int position, int len, const QString &newWord) override; 0164 0165 private: 0166 BreakTokenizerPrivate *const d; 0167 }; 0168 } 0169 #endif