File indexing completed on 2024-04-21 04:00:55
0001 /* This file is part of the KDE libraries 0002 0003 SPDX-FileCopyrightText: 2009 Jakub Stachowski <qbast@go2.pl> 0004 0005 SPDX-License-Identifier: LGPL-2.0-or-later 0006 */ 0007 0008 #ifndef ABSTRACTTOKENIZER_H 0009 #define ABSTRACTTOKENIZER_H 0010 0011 #include "sonnetcore_export.h" 0012 #include <QString> 0013 0014 #include <memory> 0015 0016 namespace Sonnet 0017 { 0018 struct Token { 0019 QStringView token = nullptr; 0020 int positionInBuffer = -1; 0021 0022 QString toString() const 0023 { 0024 return token.toString(); 0025 } 0026 0027 /** 0028 * @brief length of this token 0029 */ 0030 Q_DECL_CONSTEXPR int length() const 0031 { 0032 return token.size(); 0033 } 0034 0035 /** 0036 * @brief position in buffer of which the @ref token is a view 0037 */ 0038 Q_DECL_CONSTEXPR int position() const 0039 { 0040 return positionInBuffer; 0041 } 0042 0043 Q_DECL_CONSTEXPR bool isNull() const 0044 { 0045 return token.isNull(); 0046 } 0047 0048 Q_DECL_CONSTEXPR bool isEmpty() const 0049 { 0050 return token.isEmpty(); 0051 } 0052 0053 Q_DECL_CONSTEXPR QChar at(qsizetype n) const 0054 { 0055 return token.at(n); 0056 } 0057 }; 0058 0059 /** 0060 * @short AbstractTokenizer breaks text into smaller pieces - words, sentences, paragraphs. 0061 * 0062 * AbstractTokenizer is an abstract class that must be subclassed to be used. It provides API modelled 0063 * after Java-style iterators. During tokenization buffer can be modified using provided replace() method. 0064 * 0065 * @since 4.3 0066 */ 0067 class AbstractTokenizer 0068 { 0069 public: 0070 virtual ~AbstractTokenizer() 0071 { 0072 } 0073 0074 /** 0075 * Sets text to tokenize. It also resets tokenizer state. 0076 */ 0077 virtual void setBuffer(const QString &buffer = QString()) = 0; 0078 /** 0079 * Returns true if there is another token available. 0080 * @return true if another token is available, false if not. 0081 */ 0082 virtual bool hasNext() const = 0; 0083 0084 /** 0085 * Returns next token or null QString if there is none 0086 */ 0087 virtual Token next() = 0; 0088 0089 /** Returns content of currently tokenized buffer*/ 0090 virtual QString buffer() const = 0; 0091 0092 /** 0093 * Replace part of text in current buffer. Always use this function instead of directly 0094 * changing data in underlying buffer or tokenizer's internal state may become inconsistent. 0095 */ 0096 virtual void replace(int position, int len, const QString &newWord) = 0; 0097 }; 0098 0099 class BreakTokenizerPrivate; 0100 0101 /** 0102 @short WordTokenizer splits supplied buffer into individual words. 0103 0104 WordTokenizer splits buffer into words according to rules from Unicode standard 5.1. 0105 If purpose is to check spelling, use isSpellcheckable() to determine if current word should be 0106 checked or ignored. 0107 0108 Usage example: 0109 0110 @code 0111 WordTokenizer t(buffer); 0112 Speller sp; 0113 while (t.hasNext()) { 0114 Token word=t.next(); 0115 if (!t.isSpellcheckable()) continue; 0116 qDebug() << word.toString() << " " << sp.isCorrect(word.toString()); 0117 } 0118 @endcode 0119 0120 This example checks spelling of given buffer 0121 * @since 4.3 0122 */ 0123 class SONNETCORE_EXPORT WordTokenizer : public AbstractTokenizer 0124 { 0125 public: 0126 /** 0127 * Constructor for word tokenizer 0128 * @param buffer 0129 */ 0130 WordTokenizer(const QString &buffer = QString()); 0131 ~WordTokenizer() override; 0132 0133 void setBuffer(const QString &buffer) override; 0134 bool hasNext() const override; 0135 Token next() override; 0136 QString buffer() const override; 0137 void replace(int position, int len, const QString &newWord) override; 0138 0139 /** Returns true if this word should be spell checked. This ignores email addresses, URLs and other things according to configuration */ 0140 bool isSpellcheckable() const; 0141 0142 /** If ignore uppercase is true, then any word containing only uppercase letters will be considered unsuitable for spell check */ 0143 void setIgnoreUppercase(bool val); 0144 0145 private: 0146 SONNETCORE_NO_EXPORT bool isUppercase(QStringView word) const; 0147 0148 private: 0149 std::unique_ptr<BreakTokenizerPrivate> const d; 0150 }; 0151 0152 /** 0153 @short SentenceTokenizer splits supplied buffer into individual sentences. 0154 0155 SentenceTokenizer splits buffer into sentences according to rules from Unicode standard 5.1. 0156 * @since 4.3 0157 */ 0158 class SONNETCORE_EXPORT SentenceTokenizer : public AbstractTokenizer 0159 { 0160 public: 0161 SentenceTokenizer(const QString &buffer = QString()); 0162 ~SentenceTokenizer() override; 0163 void setBuffer(const QString &buffer) override; 0164 bool hasNext() const override; 0165 Token next() override; 0166 QString buffer() const override; 0167 void replace(int position, int len, const QString &newWord) override; 0168 0169 private: 0170 std::unique_ptr<BreakTokenizerPrivate> const d; 0171 }; 0172 } 0173 #endif